modified cloud
Modified Comparative Cloud wordclouds
Drew Conway's Comparative Cloud
Drew Conway has proposed a very interesting variant of the word cloud for comparing
the words and terms of two texts (see his "better word cloud"). Following his example,
we can try his approach on tweets from @BarackObama and @MittRomney.
Example with tweets from @BarackObama and @MittRomney
Step 1: load packages
# load packages
library(twitteR)
library(tm)
library(ggplot2)
Step 2: Collect tweets from Obama and Romney
# Collect tweets: request up to 1500 statuses from each account's timeline.
# NOTE(review): presumably a twitteR OAuth session must already be set up;
# that setup is not shown in this script -- confirm before running.
obama_tweets <- userTimeline("BarackObama", n = 1500)
romney_tweets <- userTimeline("MittRomney", n = 1500)
# Extract the text of every status object.
# vapply() (instead of sapply()) guarantees a character vector, even when a
# timeline comes back empty (sapply() would return an empty list in that case).
obama_txt <- vapply(obama_tweets, function(status) status$getText(),
                    character(1))
romney_txt <- vapply(romney_tweets, function(status) status$getText(),
                     character(1))
Step 3: Create function to clean text
# Clean a character vector of tweets for word counting.
#
# Steps, in order: drop retweet/"via" attributions, @mentions, punctuation,
# digits and links; collapse repeated blanks; trim; lower-case; and finally
# discard entries that ended up empty.
#
# Args:
#   some_txt: character vector, one tweet per element.
# Returns:
#   An unnamed character vector of cleaned, lower-cased tweets. Elements that
#   became empty are removed, so the result may be shorter than the input.
#   Elements that could not be lower-cased are returned as NA.
clean.text <- function(some_txt)
{
  # retweet / via attributions such as "RT @user" or "via @user"
  some_txt <- gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", some_txt)
  # remaining @mentions
  some_txt <- gsub("@\\w+", "", some_txt)
  some_txt <- gsub("[[:punct:]]", "", some_txt)
  some_txt <- gsub("[[:digit:]]", "", some_txt)
  # Links: punctuation is already stripped at this point, so a URL has been
  # squashed into a single "http..." word that this pattern removes whole.
  some_txt <- gsub("http\\w+", "", some_txt)
  # Collapse runs of blanks/tabs to ONE space. Replacing them with "" (as the
  # code did before) glued the neighbouring words together, e.g.
  # "thanks @bob for" -> "thanksfor".
  some_txt <- gsub("[ \t]{2,}", " ", some_txt)
  some_txt <- gsub("^\\s+|\\s+$", "", some_txt)

  # tolower() can raise an error on some inputs (e.g. invalid multibyte
  # strings); map those elements to NA instead of aborting the whole run.
  try.tolower <- function(x)
  {
    tryCatch(tolower(x), error = function(e) NA_character_)
  }
  # vapply keeps the result a character vector even for empty input
  some_txt <- vapply(some_txt, try.tolower, character(1), USE.NAMES = FALSE)

  # drop tweets that are empty after cleaning
  some_txt <- some_txt[some_txt != ""]
  names(some_txt) <- NULL
  return(some_txt)
}
Step 4: Let's clean the text
# Run both sets of tweets through the cleaning function
obama_clean <- clean.text(obama_txt)
romney_clean <- clean.text(romney_txt)
# Glue each candidate's tweets into one long space-separated string, then
# stack the two strings: element 1 is Obama, element 2 is Romney
obamas <- paste(obama_clean, collapse = " ")
romneys <- paste(romney_clean, collapse = " ")
oba_rom <- c(obamas, romneys)
Step 5: Create corpus and term-document matrix
# Two-document corpus built from oba_rom (one document per candidate)
or_corpus <- Corpus(VectorSource(oba_rom))
# Words to strip out: the English stopword list plus a few terms specific to
# these tweets.
# NOTE(review): the cleaned text has no punctuation, so stopwords that
# contain an apostrophe (e.g. "don't") presumably never match -- confirm
# whether contractions such as "dont" should be added here.
skipwords <- c(
  stopwords("english"),
  "president", "presidents", "obama", "obamas",
  "video", "todays", "reads", "live", "watch"
)
or_corpus <- tm_map(or_corpus, removeWords, skipwords)
# Term-document matrix computed from the filtered corpus
tdm <- TermDocumentMatrix(or_corpus)
Step 6: Create data frames from term-document matrix
# Convert the term-document matrix into a data frame: one row per term
# (row names are the terms), one column per document.
# as.matrix() is used instead of inspect(): inspect() is a display helper that
# prints to the console and, in current versions of tm, only returns a small
# sample of the matrix, which would silently drop most terms.
or_df <- as.data.frame(as.matrix(tdm))
names(or_df) <- c("obama.txt", "romney.txt")
# get rid of low frequency words (keep terms used more than twice by BOTH)
or_df <- subset(or_df, obama.txt > 2 & romney.txt > 2)
# calculate frequency differences (positive = used more by Obama)
or_df$freq.dif <- or_df$obama.txt - or_df$romney.txt
# tweeted more often by Obama
obama_df <- subset(or_df, freq.dif > 0)
# tweeted more often by Romney
romney_df <- subset(or_df, freq.dif < 0)
# tweeted equally often by both
both_df <- subset(or_df, freq.dif == 0)
Step 7: Create function to get the words spacing for the plot
# Vertical offsets for `spaces` words that share the same x position.
#
# Args:
#   spaces: number of words to place (a single number).
# Returns:
#   For spaces > 1, a numeric vector of `spaces` offsets, evenly spaced and
#   symmetric around 0. Otherwise a single random offset drawn by
#   jitter(0, amount = 0.2).
optimal.spacing <- function(spaces)
{
  # zero or one word: nothing to spread out, just nudge it off the axis
  if (!(spaces > 1)) {
    return(jitter(0, amount = 0.2))
  }

  unit <- 1 / spaces
  if (spaces %% 2 > 0) {
    # odd count: one word sits at 0, the others step outwards by `unit`
    half_width <- unit * floor(spaces / 2)
    step <- unit
  } else {
    # even count: no word at 0; neighbours are two units apart
    half_width <- unit * (spaces - 1)
    step <- unit * 2
  }
  seq(-half_width, half_width, step)
}
Step 8: Apply function optimal.spacing
# Get spacing for each frequency-difference value.
# table() counts how many words share each freq.dif; optimal.spacing() turns
# each count into that many vertical offsets.
# lapply() is used so the result is ALWAYS a list named by the freq.dif value.
# sapply() would simplify to a matrix whenever every group happens to have the
# same size (> 1); names() of a matrix is NULL, so the per-name lookup that
# follows would silently assign nothing.
obama_spacing <- lapply(table(obama_df$freq.dif), optimal.spacing)
romney_spacing <- lapply(table(romney_df$freq.dif), optimal.spacing)
# both_df only contains freq.dif == 0, so a flat numeric vector with one
# offset per row is all that is needed here
both_spacing <- as.numeric(unlist(
  lapply(table(both_df$freq.dif), optimal.spacing),
  use.names = FALSE
))
Step 9: Add spacing column to data frames
# Attach a Spacing column (vertical offset) to each data frame.
# For Obama and Romney, the offsets are looked up by frequency difference:
# the names of *_spacing are the freq.dif values written as text.
obama_optim <- numeric(nrow(obama_df))
for (dif_label in names(obama_spacing)) {
  hits <- obama_df$freq.dif == as.numeric(dif_label)
  obama_optim[hits] <- obama_spacing[[dif_label]]
}
obama_df <- transform(obama_df, Spacing = obama_optim)

romney_optim <- numeric(nrow(romney_df))
for (dif_label in names(romney_spacing)) {
  hits <- romney_df$freq.dif == as.numeric(dif_label)
  romney_optim[hits] <- romney_spacing[[dif_label]]
}
romney_df <- transform(romney_df, Spacing = romney_optim)

# Words used equally often all share freq.dif == 0, so the offsets are
# simply flattened into the column
both_df$Spacing <- as.vector(both_spacing)
Step 10: Let's visualize the wordcloud
# Draw the comparative word cloud with ggplot2.
# x = frequency difference (Romney on the left, Obama on the right),
# y = the Spacing offset, text size = how often the word was used,
# colour = frequency difference on a red-to-blue gradient.
ggplot(obama_df, aes(x = freq.dif, y = Spacing)) +
  geom_text(aes(size = obama.txt, label = row.names(obama_df),
                colour = freq.dif), alpha = 0.7, family = "Times") +
  geom_text(data = romney_df, aes(x = freq.dif, y = Spacing,
                                  label = row.names(romney_df),
                                  size = romney.txt, color = freq.dif),
            alpha = 0.7, family = "Times") +
  geom_text(data = both_df, aes(x = freq.dif, y = Spacing,
                                label = row.names(both_df),
                                size = obama.txt, color = freq.dif),
            alpha = 0.7, family = "Times") +
  scale_size(range = c(3, 11)) +
  scale_colour_gradient(low = "red3", high = "blue3", guide = "none") +
  scale_x_continuous(
    breaks = c(min(romney_df$freq.dif), 0, max(obama_df$freq.dif)),
    labels = c("Twitted More by Romney", "Twitted Equally",
               "Twitted More by Obama")) +
  scale_y_continuous(breaks = c(0), labels = c("")) +
  labs(x = "", y = "", size = "Word Frequency",
       title = "Conway's Word Cloud, Tweets (Obama -vs- Romney)") +
  theme_bw() +
  # theme()/element_*() replace opts()/theme_blank()/theme_text(), which were
  # removed from ggplot2 and make the original call fail on current versions
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        plot.title = element_text(family = "Times", size = 18))
# save plot in pdf (ggsave() writes the most recent plot by default)
ggsave("Obama_Romney_ModifyCloud.pdf", width = 13, height = 8, units = "in")
© Gaston Sanchez