LCCC

I had an adjunct gig at LCCC before securing a tenure track position. Why I am proud to say that I teach at LCCC ^_^

library(tm) # text mining and document management

library(stringr) # character manipulation with regular expressions

library(grid) # grid graphics utilities

library(ggplot2) # graphics

library(latticeExtra) # package used for text horizon plot

library(caret) # for confusion matrix function

library(rpart) # tree-structured modeling

library(e1071) # support vector machines

library(randomForest) # random forests

library(rpart.plot) # plot tree-structured model information

library(plyr)

path <- "C:/Users/User/Documents/MSPA/PRE452/review"

path2 <- "C:/Users/User/Documents/MSPA/PRE452/Session7/test"

d <- read.csv("E:/Predict453/RPrograms/AFINN-111.csv",header=FALSE)

d.pos <- d[d[,2] >0,]

d.neg <- d[d[,2] < 0,]

mycorpus <- Corpus(DirSource(directory=path,encoding="ANSI"))

mycorpus <- tm_map(mycorpus,removeNumbers)

mycorpus <- tm_map(mycorpus,tolower)

mycorpus <- tm_map(mycorpus,stripWhitespace)

mycorpus <- tm_map(mycorpus,removeWords,stopwords("english"))

stopwords("english")

mycorpus <- tm_map(mycorpus,removePunctuation)

inspect(mycorpus)

mycorpus.test <- Corpus(DirSource(directory=path2,encoding="ANSI"))

mycorpus.test <- tm_map(mycorpus.test,removeNumbers)

mycorpus.test <- tm_map(mycorpus.test,tolower)

mycorpus.test <- tm_map(mycorpus.test,stripWhitespace)

mycorpus.test <- tm_map(mycorpus.test,removeWords,stopwords("english"))

stopwords("english")

mycorpus.test <- tm_map(mycorpus.test,removePunctuation)

inspect(mycorpus.test)

words <- c("lccc","campus", "will","continu", "tuition", "want","educ", "area", "price","student","the","course","also","with","which","within","think","class","was","way","until","that","than","food")

mycorpus <- tm_map(mycorpus,removeWords,words)

mycorpus <- tm_map(mycorpus,removePunctuation,preserve_intra_word_dashes = TRUE)

mycorpus <- tm_map(mycorpus,stemDocument)

feedback <- DocumentTermMatrix(mycorpus)

inspect(feedback)

findFreqTerms(feedback,5)

findAssocs(feedback,"great",0.5)

mycorpus.test <- tm_map(mycorpus.test,removeWords,words)

mycorpus.test <- tm_map(mycorpus.test,removePunctuation,preserve_intra_word_dashes = TRUE)

mycorpus.test <- tm_map(mycorpus.test,stemDocument)

feedback.test <- DocumentTermMatrix(mycorpus.test)

inspect(feedback.test)

findFreqTerms(feedback.test,5)

findAssocs(feedback.test,"great",0.5)

# -------------------------------------------------------------

# Obtain word frequency and the association between the words

# -------------------------------------------------------------

mycorpus <- str_split(unlist(mycorpus),"\\s+")

top.words.train <- Terms(feedback)

print(top.words.train)

termFrequencyTrain <- colSums(as.matrix(feedback))

termFrequencyTrain <- subset(termFrequencyTrain, termFrequencyTrain>=3)

qplot(names(termFrequencyTrain), termFrequencyTrain, geom="bar", xlab="Terms") + coord_flip()

mycorpus.test <- str_split(unlist(mycorpus.test),"\\s+")

top.words.test <- Terms(feedback.test)

print(top.words)

termFrequencyTest <- colSums(as.matrix(feedback.test))

termFrequencyTest <- subset(termFrequencyTest, termFrequencyTest>=3)

qplot(names(termFrequencyTest), termFrequencyTest, geom="bar", xlab="Terms") + coord_flip()

sent.pos <- sapply(mycorpus,function(x) match(x,d.pos[,1]))

sent.pos <- sapply(mycorpus,function(x) match(x,pos))

score.pos <- sapply(sent.pos,na.omit)

score.pos <- sapply(score.pos,length)

score.pos

sent.neg <- sapply(mycorpus,function(x) match(x,d.neg[,1]))

sent.neg <- sapply(mycorpus,function(x) match(x,neg))

score.neg <- sapply(sent.neg,na.omit)

score.neg <- sapply(score.neg,length)

overall.score <- score.pos - score.neg

overall.score2 <- ifelse(overall.score>=0,1,0)

overall.score3 <- ifelse(overall.score>=0,"GOOD","BAD")

sent <- sapply(mycorpus,function(x) match(x,d[,1]))

scores <- sapply(sent,function(x) d[x,2])

scores <- sapply(scores,na.omit)

scores <- sapply(scores,sum)

scores

mycorpus2 <- cbind(score.pos, score.neg, overall.score, overall.score2, overall.score3, scores)

write.csv(mycorpus2,"E:/PRE452/Session7/sent_scores.csv")

sent.pos <- sapply(mycorpus.test,function(x) match(x,d.pos[,1]))

sent.pos <- sapply(mycorpus.test,function(x) match(x,pos))

score.pos <- sapply(sent.pos,na.omit)

score.pos <- sapply(score.pos,length)

score.pos

sent.neg <- sapply(mycorpus.test,function(x) match(x,d.neg[,1]))

sent.neg <- sapply(mycorpus.test,function(x) match(x,neg))

score.neg <- sapply(sent.neg,na.omit)

score.neg <- sapply(score.neg,length)

overall.score <- score.pos - score.neg

overall.score2 <- ifelse(overall.score>=1,1,0)

overall.score3 <- ifelse(overall.score>=1,"GOOD","BAD")

sent <- sapply(mycorpus.test,function(x) match(x,d[,1]))

scores <- sapply(sent,function(x) d[x,2])

scores <- sapply(scores,na.omit)

scores <- sapply(scores,sum)

scores

mycorpus2.test <- cbind(score.pos, score.neg, overall.score, overall.score2, overall.score3, scores)

write.csv(mycorpus2.test,"E:/PRE452/Session7/sent_scores_test.csv")

# -------------------------------------------------------------

# Principal components analysis

# -------------------------------------------------------------

princomp <- cbind(score.pos, score.neg)

pc.results<-princomp(princomp, cor=TRUE)

print(summary(pc.results))

plot(pc.results,type="lines") # scree plot

pc.results$scores

setwd('C:/Users/User/Documents/MSPA/PRE452/Session7/')

pdf(file = "Biplot of Positive and negative sentiments from training data.pdf", width = 8.5, height = 11)

biplot(pc.results, xlab = "First Pricipal Component", ylab = "Second Principal Component")

dev.off()

pdf(file = "Biplot of Positive and negative sentiments from test data.pdf", width = 8.5, height = 11)

biplot(pc.results, xlab = "First Pricipal Component", ylab = "Second Principal Component")

dev.off()

# ----------------------------------------------------

# Develop predictive models using the training data:

# Regression difference method

# -----------------------------------------------------

# regression method for determining weights on POSITIVE AND NEGATIVE

# fit a regression model to the training data

regression.model <- lm(scores ~ score.pos + score.neg, data = train.data.frame)

print(regression.model) # provides 0.03697 + 2.32994(score.pos) -2.03697(score.neg)

train.data.frame$regression <-

predict(regression.model, newdata = train.data.frame)

# determine the cutoff for regression.difference

try.tree <- rpart(scores ~ regression, data = train.data.frame)

print(try.tree) # note that the first split is at 1.201941

# create a user-defined function for the simple difference method

predict.regression <- function(x) {

if (x >= 1.201941) return("GOOD")

if (x < 1.201941) return("BAD")

}

train.data.frame$pred.regression <- ifelse(train.data.frame$regression >= 1.201941, c("GOOD"), c("BAD"))

crosstab <- table(train.data.frame$pred.regression, train.data.frame$overall.score3)

train.pred.regression.performance<-confusionMatrix(crosstab,positive = "GOOD")

# report full set of statistics relating to predictive accuracy

print(train.pred.regression.performance) # result 63.64 Percent

write.csv(train.data.frame,"E:/PRE452/Session7/train.csv")

cat("\n\nTraining set percentage correctly predicted by regression = ",

sprintf("%1.1f",train.pred.regression.performance$overall[1]*100),

" Percent",sep="")

# regression method for determining weights on POSITIVE AND NEGATIVE

# for the test set we use the model developed on the training set

regression.model <- lm(scores ~ score.pos + score.neg, data = test.data.frame)

print(regression.model) # provides 0.03626 + 2.25(score.pos) -0.05(score.neg)

test.data.frame$regression <-

predict(regression.model, newdata = test.data.frame)

try.tree <- rpart(scores ~ regression, data = test.data.frame)

print(try.tree)

test.data.frame$pred.regression <- ifelse(test.data.frame$regression >= 1.125, c("GOOD"), c("BAD"))

crosstab <- table(test.data.frame$pred.regression, test.data.frame$overall.score3)

test.pred.regression.performance<-confusionMatrix(crosstab,positive = "GOOD")

# report full set of statistics relating to predictive accuracy

print(test.pred.regression.performance) # result 100 Percent

cat("\n\nTest set percentage correctly predicted = ",

sprintf("%1.1f",test.pred.regression.performance$overall[1]*100),

" Percent",sep="")

# --------------------------------------

# Hierarchical clustering

# --------------------------------------

# See which documents are clustered together

train.scale <- scale(feedback)

dist.train <- dist(train.scale, method = "euclidean")

fit.train <- hclust(dist.train, method="ward")

pdf(file = "Hierarchical clustering with training data using the Ward method.pdf", width = 8.5, height = 11)

plot(fit.train)

biplot(fit.train, xlab = "Distance", ylab = "Height")

dev.off()

test.scale <- scale(feedback.test)

dist.test <- dist(test.scale, method = "euclidean")

fit.test <- hclust(dist.test, method="ward")

pdf(file = "Hierarchical clustering with test data using the Ward method.pdf", width = 8.5, height = 11)

plot(fit.test)

biplot(fit.test, xlab = "Distance", ylab = "Height")

dev.off()

# See which words are clustered together

train.mat <- as.matrix(inspect(feedback))

train.mat <- as.matrix(train.words[,-1])

train.mat <- t(train.mat)

dist.train <- dist(scale(train.mat))

fit.train <- hclust(dist.train, method="ward")

pdf(file = "Hierarchical clustering of words from the training data.pdf", width = 8.5, height = 11)

plot(fit.train)

rect.hclust(fit.train, k=5)

(groups <- cutree(fit.train, k=5))

dev.off()

test.mat <- as.matrix(inspect(feedback.test))

test.mat <- as.matrix(test.words[,-1])

test.mat <- t(test.mat)

dist.test <- dist(scale(test.mat))

fit.test <- hclust(dist.test, method="ward")

pdf(file = "Hierarchical clustering of words from the test data.pdf", width = 8.5, height = 11)

plot(fit.test)

rect.hclust(fit.test, k=5)

(groups <- cutree(fit.test, k=5))

dev.off()

Page updated

Google Sites

Report abuse