I had an adjunct gig at LCCC before securing a tenure track position. Why I am proud to say that I teach at LCCC ^_^
library(tm) # text mining and document management
library(stringr) # character manipulation with regular expressions
library(grid) # grid graphics utilities
library(ggplot2) # graphics
library(latticeExtra) # package used for text horizon plot
library(caret) # for confusion matrix function
library(rpart) # tree-structured modeling
library(e1071) # support vector machines
library(randomForest) # random forests
library(rpart.plot) # plot tree-structured model information
library(plyr)
path <- "C:/Users/User/Documents/MSPA/PRE452/review"
path2 <- "C:/Users/User/Documents/MSPA/PRE452/Session7/test"
d <- read.csv("E:/Predict453/RPrograms/AFINN-111.csv",header=FALSE)
d.pos <- d[d[,2] >0,]
d.neg <- d[d[,2] < 0,]
mycorpus <- Corpus(DirSource(directory=path,encoding="ANSI"))
mycorpus <- tm_map(mycorpus,removeNumbers)
mycorpus <- tm_map(mycorpus,tolower)
mycorpus <- tm_map(mycorpus,stripWhitespace)
mycorpus <- tm_map(mycorpus,removeWords,stopwords("english"))
stopwords("english")
mycorpus <- tm_map(mycorpus,removePunctuation)
inspect(mycorpus)
mycorpus.test <- Corpus(DirSource(directory=path2,encoding="ANSI"))
mycorpus.test <- tm_map(mycorpus.test,removeNumbers)
mycorpus.test <- tm_map(mycorpus.test,tolower)
mycorpus.test <- tm_map(mycorpus.test,stripWhitespace)
mycorpus.test <- tm_map(mycorpus.test,removeWords,stopwords("english"))
stopwords("english")
mycorpus.test <- tm_map(mycorpus.test,removePunctuation)
inspect(mycorpus.test)
words <- c("lccc","campus", "will","continu", "tuition", "want","educ", "area", "price","student","the","course","also","with","which","within","think","class","was","way","until","that","than","food")
mycorpus <- tm_map(mycorpus,removeWords,words)
mycorpus <- tm_map(mycorpus,removePunctuation,preserve_intra_word_dashes = TRUE)
mycorpus <- tm_map(mycorpus,stemDocument)
feedback <- DocumentTermMatrix(mycorpus)
inspect(feedback)
findFreqTerms(feedback,5)
findAssocs(feedback,"great",0.5)
mycorpus.test <- tm_map(mycorpus.test,removeWords,words)
mycorpus.test <- tm_map(mycorpus.test,removePunctuation,preserve_intra_word_dashes = TRUE)
mycorpus.test <- tm_map(mycorpus.test,stemDocument)
feedback.test <- DocumentTermMatrix(mycorpus.test)
inspect(feedback.test)
findFreqTerms(feedback.test,5)
findAssocs(feedback.test,"great",0.5)
# -------------------------------------------------------------
# Obtain word frequency and the association between the words
# -------------------------------------------------------------
mycorpus <- str_split(unlist(mycorpus),"\\s+")
top.words.train <- Terms(feedback)
print(top.words.train)
termFrequencyTrain <- colSums(as.matrix(feedback))
termFrequencyTrain <- subset(termFrequencyTrain, termFrequencyTrain>=3)
qplot(names(termFrequencyTrain), termFrequencyTrain, geom="bar", xlab="Terms") + coord_flip()
mycorpus.test <- str_split(unlist(mycorpus.test),"\\s+")
top.words.test <- Terms(feedback.test)
print(top.words)
termFrequencyTest <- colSums(as.matrix(feedback.test))
termFrequencyTest <- subset(termFrequencyTest, termFrequencyTest>=3)
qplot(names(termFrequencyTest), termFrequencyTest, geom="bar", xlab="Terms") + coord_flip()
sent.pos <- sapply(mycorpus,function(x) match(x,d.pos[,1]))
sent.pos <- sapply(mycorpus,function(x) match(x,pos))
score.pos <- sapply(sent.pos,na.omit)
score.pos <- sapply(score.pos,length)
score.pos
sent.neg <- sapply(mycorpus,function(x) match(x,d.neg[,1]))
sent.neg <- sapply(mycorpus,function(x) match(x,neg))
score.neg <- sapply(sent.neg,na.omit)
score.neg <- sapply(score.neg,length)
overall.score <- score.pos - score.neg
overall.score2 <- ifelse(overall.score>=0,1,0)
overall.score3 <- ifelse(overall.score>=0,"GOOD","BAD")
sent <- sapply(mycorpus,function(x) match(x,d[,1]))
scores <- sapply(sent,function(x) d[x,2])
scores <- sapply(scores,na.omit)
scores <- sapply(scores,sum)
scores
mycorpus2 <- cbind(score.pos, score.neg, overall.score, overall.score2, overall.score3, scores)
write.csv(mycorpus2,"E:/PRE452/Session7/sent_scores.csv")
sent.pos <- sapply(mycorpus.test,function(x) match(x,d.pos[,1]))
sent.pos <- sapply(mycorpus.test,function(x) match(x,pos))
score.pos <- sapply(sent.pos,na.omit)
score.pos <- sapply(score.pos,length)
score.pos
sent.neg <- sapply(mycorpus.test,function(x) match(x,d.neg[,1]))
sent.neg <- sapply(mycorpus.test,function(x) match(x,neg))
score.neg <- sapply(sent.neg,na.omit)
score.neg <- sapply(score.neg,length)
overall.score <- score.pos - score.neg
overall.score2 <- ifelse(overall.score>=1,1,0)
overall.score3 <- ifelse(overall.score>=1,"GOOD","BAD")
sent <- sapply(mycorpus.test,function(x) match(x,d[,1]))
scores <- sapply(sent,function(x) d[x,2])
scores <- sapply(scores,na.omit)
scores <- sapply(scores,sum)
scores
mycorpus2.test <- cbind(score.pos, score.neg, overall.score, overall.score2, overall.score3, scores)
write.csv(mycorpus2.test,"E:/PRE452/Session7/sent_scores_test.csv")
# -------------------------------------------------------------
# Principal components analysis
# -------------------------------------------------------------
princomp <- cbind(score.pos, score.neg)
pc.results<-princomp(princomp, cor=TRUE)
print(summary(pc.results))
plot(pc.results,type="lines") # scree plot
pc.results$scores
setwd('C:/Users/User/Documents/MSPA/PRE452/Session7/')
pdf(file = "Biplot of Positive and negative sentiments from training data.pdf", width = 8.5, height = 11)
biplot(pc.results, xlab = "First Pricipal Component", ylab = "Second Principal Component")
dev.off()
pdf(file = "Biplot of Positive and negative sentiments from test data.pdf", width = 8.5, height = 11)
biplot(pc.results, xlab = "First Pricipal Component", ylab = "Second Principal Component")
dev.off()
# ----------------------------------------------------
# Develop predictive models using the training data:
# Regression difference method
# -----------------------------------------------------
# regression method for determining weights on POSITIVE AND NEGATIVE
# fit a regression model to the training data
regression.model <- lm(scores ~ score.pos + score.neg, data = train.data.frame)
print(regression.model) # provides 0.03697 + 2.32994(score.pos) -2.03697(score.neg)
train.data.frame$regression <-
predict(regression.model, newdata = train.data.frame)
# determine the cutoff for regression.difference
try.tree <- rpart(scores ~ regression, data = train.data.frame)
print(try.tree) # note that the first split is at 1.201941
# create a user-defined function for the simple difference method
predict.regression <- function(x) {
if (x >= 1.201941) return("GOOD")
if (x < 1.201941) return("BAD")
}
train.data.frame$pred.regression <- ifelse(train.data.frame$regression >= 1.201941, c("GOOD"), c("BAD"))
crosstab <- table(train.data.frame$pred.regression, train.data.frame$overall.score3)
train.pred.regression.performance<-confusionMatrix(crosstab,positive = "GOOD")
# report full set of statistics relating to predictive accuracy
print(train.pred.regression.performance) # result 63.64 Percent
write.csv(train.data.frame,"E:/PRE452/Session7/train.csv")
cat("\n\nTraining set percentage correctly predicted by regression = ",
sprintf("%1.1f",train.pred.regression.performance$overall[1]*100),
" Percent",sep="")
# regression method for determining weights on POSITIVE AND NEGATIVE
# for the test set we use the model developed on the training set
regression.model <- lm(scores ~ score.pos + score.neg, data = test.data.frame)
print(regression.model) # provides 0.03626 + 2.25(score.pos) -0.05(score.neg)
test.data.frame$regression <-
predict(regression.model, newdata = test.data.frame)
try.tree <- rpart(scores ~ regression, data = test.data.frame)
print(try.tree)
test.data.frame$pred.regression <- ifelse(test.data.frame$regression >= 1.125, c("GOOD"), c("BAD"))
crosstab <- table(test.data.frame$pred.regression, test.data.frame$overall.score3)
test.pred.regression.performance<-confusionMatrix(crosstab,positive = "GOOD")
# report full set of statistics relating to predictive accuracy
print(test.pred.regression.performance) # result 100 Percent
cat("\n\nTest set percentage correctly predicted = ",
sprintf("%1.1f",test.pred.regression.performance$overall[1]*100),
" Percent",sep="")
# --------------------------------------
# Hierarchical clustering
# --------------------------------------
# See which documents are clustered together
train.scale <- scale(feedback)
dist.train <- dist(train.scale, method = "euclidean")
fit.train <- hclust(dist.train, method="ward")
pdf(file = "Hierarchical clustering with training data using the Ward method.pdf", width = 8.5, height = 11)
plot(fit.train)
biplot(fit.train, xlab = "Distance", ylab = "Height")
dev.off()
test.scale <- scale(feedback.test)
dist.test <- dist(test.scale, method = "euclidean")
fit.test <- hclust(dist.test, method="ward")
pdf(file = "Hierarchical clustering with test data using the Ward method.pdf", width = 8.5, height = 11)
plot(fit.test)
biplot(fit.test, xlab = "Distance", ylab = "Height")
dev.off()
# See which words are clustered together
train.mat <- as.matrix(inspect(feedback))
train.mat <- as.matrix(train.words[,-1])
train.mat <- t(train.mat)
dist.train <- dist(scale(train.mat))
fit.train <- hclust(dist.train, method="ward")
pdf(file = "Hierarchical clustering of words from the training data.pdf", width = 8.5, height = 11)
plot(fit.train)
rect.hclust(fit.train, k=5)
(groups <- cutree(fit.train, k=5))
dev.off()
test.mat <- as.matrix(inspect(feedback.test))
test.mat <- as.matrix(test.words[,-1])
test.mat <- t(test.mat)
dist.test <- dist(scale(test.mat))
fit.test <- hclust(dist.test, method="ward")
pdf(file = "Hierarchical clustering of words from the test data.pdf", width = 8.5, height = 11)
plot(fit.test)
rect.hclust(fit.test, k=5)
(groups <- cutree(fit.test, k=5))
dev.off()