2.4. Text Mining (R-Programming)

2.4. Text Mining through R-Programming

Text mining in R allows us to identify the most frequently used keywords in a body of text. Ryan will use the text mining package (tm) and the word cloud generator package (wordcloud) to analyze the text and to visualize the keywords as a word cloud.

# Install the required packages, skipping any that are already installed.
# (Calling install.packages() unconditionally re-downloads every package each
# time the script is run.)
#
#   tm           - text mining
#   SnowballC    - text stemming: reduces words to their root form
#   wordcloud    - word-cloud generator
#   RColorBrewer - color palettes
#   RCurl, XML   - kept from the original one-line install command; they are
#                  not loaded by the library() calls visible in this script

required_packages <- c(
  "tm", "SnowballC", "wordcloud", "RColorBrewer", "RCurl", "XML"
)
missing_packages <- setdiff(required_packages, rownames(installed.packages()))
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

#Load the required packages

library("tm")
library("SnowballC")
library("wordcloud")
library("RColorBrewer")
library("wordcloud")

# Read the text file: readLines() returns a character vector with one element
# per line of the file.

filePath <- "~/Desktop/Text Mining Dataset.csv"
text <- readLines(filePath)

# Load the data as a corpus. The corpus is built from `text`, the vector read
# above; the script never defines an object named `Text_Mining_Dataset`, so
# referring to it here would stop with "object not found".

docs <- Corpus(VectorSource(text))

# Inspect the content of the document (optional)

inspect(docs)

# Clean the corpus with tm_map(). Each step replaces `docs` with the
# transformed corpus, so the order of the steps below matters.

# Replace selected special characters with a space. toSpace wraps gsub() in
# content_transformer() so that it can be applied through tm_map(); the
# patterns are regular expressions, hence the escaped "|".

toSpace <- content_transformer(function(txt, pat) gsub(pat, " ", txt))
docs <- Reduce(
  function(corpus, pat) tm_map(corpus, toSpace, pat),
  c("/", "@", "\\|"),
  docs
)

# Convert the text to lower case

docs <- tm_map(docs, FUN = content_transformer(tolower))

# Remove numbers

docs <- tm_map(docs, FUN = removeNumbers)

# Remove common English stop words

docs <- tm_map(docs, FUN = removeWords, words = stopwords("english"))

# Remove your own custom stop words (replace the placeholders as needed)

docs <- tm_map(docs, FUN = removeWords, words = c("abc", "cba"))

# Remove punctuation

docs <- tm_map(docs, FUN = removePunctuation)

# Eliminate extra white space

docs <- tm_map(docs, FUN = stripWhitespace)

# Text stemming

docs <- tm_map(docs, FUN = stemDocument)

# Build a term-document matrix and derive a word-frequency table from it.
#   dtm - the term-document matrix for the cleaned corpus
#   m   - the same data as an ordinary matrix
#   v   - row totals of m, sorted from highest to lowest
#   d   - data frame with one row per term: `word` and its total `freq`

dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE)
d <- data.frame(word = names(v), freq = v)

# Show the first ten rows, i.e. the ten highest totals
head(d, n = 10)

# Generate the word cloud from the frequency table `d`.
# The random seed is set immediately before drawing.

set.seed(1234)
wordcloud(
  words = d$word,
  freq = d$freq,
  min.freq = 1,
  max.words = 200,
  random.order = FALSE,
  rot.per = 0.35,
  colors = brewer.pal(n = 8, name = "Dark2")
)

# Identify frequent terms in the term-document matrix
# (lowfreq = 50 is the lower frequency bound for a term to be listed)

findFreqTerms(dtm, lowfreq = 50)

# Example console output from the original run, kept as comments so that the
# script still parses:
#
# [1] "airlin" "cabin" "crew" "flight" "food" "good" "servic" "singapor"
# [9] "seat" "time"

# Analyze the association between frequent terms
# (corlimit = 0.3 is the lower correlation limit for a term to be reported)

findAssocs(dtm, terms = "good", corlimit = 0.3)

# Example console output from the original run, kept as comments so that the
# script still parses (each value is listed under its term):
#
# $good
#        wine touchscreen        base      cathay
#        0.43        0.42        0.41        0.41
#      oldest       pacif    proactiv      remain
#        0.41        0.41        0.41        0.41
#       retir       spark        step     section
#        0.41        0.41        0.41        0.34
#      select   exemplari        seen      terrif
#        0.33        0.31        0.31        0.31
#       video  welldesign
#        0.31        0.31

# Plot word frequencies for the first (up to) ten rows of `d`.
# head() is used instead of d[1:10, ] because indexing past the last row of a
# data frame yields NA rows, which would show up as empty bars labelled NA
# when the corpus contains fewer than ten distinct terms.

top_words <- head(d, 10)
barplot(
  top_words$freq,
  las = 2, # draw axis labels perpendicular to the axis
  names.arg = top_words$word,
  col = "lightblue",
  main = "Most frequent words",
  ylab = "Word frequencies"
)

Google Sites

Report abuse