Let's say we want to know more about the tweets of a given user.

For example, consider the Twitter account of REI (Recreational Equipment, Inc.), an American retail corporation specializing in outdoor recreation gear: @REI

What does @REI tweet about?

Step 1: Load the necessary packages

# load packages
# One library() call per package whose functions are used in the steps below.
library(twitteR)       # userTimeline(), twListToDF()
library(tm)            # Corpus(), tm_map(), TermDocumentMatrix(), stopwords()
library(FactoMineR)    # CA()
library(cluster)       # pam()
library(RColorBrewer)  # brewer.pal()
library(ggplot2)       # ggplot(), geom_text(), scale_*()

Step 2: Collect tweets from REI

# Fetch up to 1000 statuses from the @REI timeline.
rei_tweets <- userTimeline(user = "REI", n = 1000)

# Flatten the list of status objects into a data frame (one row per tweet).
rei_df <- twListToDF(rei_tweets)

# Keep only the tweet text for the analysis that follows.
rei_txt <- rei_df[["text"]]

Step 3: Let's do some text cleaning

# Patterns to strip from the raw tweet text, applied strictly in this order.
# The order matters: links are removed last, after punctuation and digits
# have been dropped, so that each URL has collapsed into one "http..." token.
cleanup_patterns <- c(
  "(RT|via)((?:\\b\\W*@\\w+)+)",  # retweet entities
  "@\\w+",                        # @people mentions
  "[[:punct:]]",                  # punctuation symbols
  "[[:digit:]]",                  # numbers
  "http\\w+"                      # links
)

# Fold the patterns over the text, deleting every match at each step.
rei_clean <- Reduce(
  function(txt, pattern) gsub(pattern, "", txt),
  cleanup_patterns,
  rei_txt
)

Step 4: Create Corpus, apply transformations, and get term-document matrix

# Build a corpus with one document per cleaned tweet.
rei_corpus <- Corpus(VectorSource(rei_clean))

# Convert to lower case. tolower() is a plain character function, so it is
# wrapped in content_transformer(); passing it bare to tm_map() replaces the
# documents with raw character vectors on tm >= 0.6, which then breaks
# TermDocumentMatrix().
rei_corpus <- tm_map(rei_corpus, content_transformer(tolower))

# Remove English stopwords, plus "rei" itself (the account's own name).
rei_corpus <- tm_map(rei_corpus, removeWords, c(stopwords("english"), "rei"))

# Collapse the extra white-space left behind by the removals.
rei_corpus <- tm_map(rei_corpus, stripWhitespace)

# Term-document matrix: terms in rows, tweets in columns.
tdm <- TermDocumentMatrix(rei_corpus)

# Dense matrix form, needed for the row/column arithmetic below.
m <- as.matrix(tdm)

Step 5: We need to keep most frequent terms

For instance, let's keep only those words whose frequency is above the 90th percentile

# Keep only the frequent terms: those whose total count across all tweets is
# strictly above the 90th percentile of term counts.
wf <- rowSums(m)
# drop = FALSE keeps the result a matrix even if a single term survives.
m1 <- m[wf > quantile(wf, probs = 0.9), , drop = FALSE]

# Remove tweets (columns) that contain none of the retained terms.
# drop = FALSE again guards against collapsing to a vector.
m1 <- m1[, colSums(m1) != 0, drop = FALSE]

# For convenience, make every entry binary (0 or 1): presence/absence of a
# term in a tweet rather than its count.
m1[m1 > 1] <- 1

Step 6: Let's keep exploring by applying a cluster analysis

This will let us discover more about groups of words

# Distance matrix between terms using the binary distance on the 0/1 rows.
m1dist <- dist(m1, method = "binary")

# Hierarchical clustering with Ward's method. Since R 3.1.0 the algorithm
# formerly called "ward" is named "ward.D"; the old name is only accepted
# through a compatibility message, so the current name is used here.
clus1 <- hclust(m1dist, method = "ward.D")

# Plot the dendrogram with slightly reduced label size.
plot(clus1, cex = 0.7)

Step 7: For a better visualization, we can apply a Correspondence Analysis (using the package FactoMineR)

# Correspondence analysis of the term-by-tweet matrix; graph = FALSE
# suppresses FactoMineR's own plot so we can draw a custom one.
rei_ca <- CA(m1, graph = FALSE)

# Default plot of words: set up an empty canvas (no points, axes or labels)
# spanning the row coordinates ...
plot(rei_ca$row$coord, type = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")

# ... then write each word at its position on the first two dimensions.
# NOTE(review): the original text() call was truncated after `labels=`; any
# extra styling arguments (e.g. cex/col) that followed are unknown, so the
# call is simply closed here.
text(rei_ca$row$coord[, 1], rei_ca$row$coord[, 2], labels = rownames(m1))

title(main = "@REI Correspondence Analysis of tweet words", cex.main = 1)

Step 8: To improve the correspondence analysis plot, we can apply a clustering method like k-means or partitioning around medoids (pam)

# Number of clusters for partitioning around medoids.
k <- 6

# Run PAM on the word coordinates of the first two CA dimensions.
rei_pam <- pam(x = rei_ca$row$coord[, 1:2], k = k)

# Cluster membership of every word.
clusters <- rei_pam[["clustering"]]

Step 9: Let's try to get a nicer plot

# First we need a colour palette: eight "Dark2" colours from RColorBrewer.
gbrew <- brewer.pal(8, "Dark2")

# Convert the palette to HSV (rows: hue, saturation, value) so the colours
# can be re-created through hsv() with a transparency level.
gpal <- rgb2hsv(col2rgb(gbrew))

# One semi-transparent colour per cluster. hsv() is vectorised, so the first
# k palette columns are converted in a single call.
gcols <- hsv(
  gpal[1, seq_len(k)],
  gpal[2, seq_len(k)],
  gpal[3, seq_len(k)],
  alpha = 0.65
)

# Plot with frequencies: label size is the log10 of each word's row sum in
# the binary matrix m1 (i.e. the number of tweets containing the word).
wcex <- log10(rowSums(m1))

# Empty canvas spanning the CA row coordinates (no points, axes or labels).
plot(rei_ca$row$coord, type = "n", xaxt = "n", yaxt = "n", xlab = "", ylab = "")
title("@REI Correspondence Analysis of tweet words", cex.main = 1)

# Draw the words cluster by cluster so each cluster gets its own colour.
for (i in seq_len(k)) {
  tmp <- clusters == i
  text(
    rei_ca$row$coord[tmp, 1], rei_ca$row$coord[tmp, 2],
    labels = rownames(m1)[tmp], cex = wcex[tmp], col = gcols[i]
  )
}



Step 10: For the ggploters, a similar graphic can be obtained like this

# Gather everything the ggplot version needs into one data frame:
# the word, its coordinates on the first two CA dimensions, its row sum
# in m1, and its PAM cluster as a factor.
rei_words_df <- data.frame(
  words   = rownames(m1),
  dim1    = rei_ca$row$coord[, 1],
  dim2    = rei_ca$row$coord[, 2],
  freq    = rowSums(m1),
  cluster = as.factor(clusters)
)

# Plot the words at their CA coordinates, sized by frequency and coloured by
# cluster. opts(), theme_text() and theme_blank() were removed from ggplot2,
# so the title goes through labs() and the styling through theme() with
# element_text() / element_blank().
# NOTE(review): the original opts() call was truncated (one argument line
# and the closing parenthesis were missing); only the settings still visible
# in the source are reproduced here.
ggplot(rei_words_df, aes(x = dim1, y = dim2, label = words)) +
  geom_text(aes(size = freq, colour = cluster), alpha = 0.7) +
  scale_size_continuous(breaks = seq(20, 80, by = 10), range = c(3, 8)) +
  scale_colour_manual(values = brewer.pal(8, "Dark2")) +
  labs(x = "", y = "", title = "What does @REI tweet about?") +
  theme(
    plot.title = element_text(size = 12),
    legend.position = "none",
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  )


