wordgraphs

Talking about Word Graphs

A word graph is somewhat similar to a wordcloud although they are not the same thing.

The following code is adapted from the work of Drew Conway in his analysis of poliscijobrumours

Example of a wordgraph for @Greenpeace

Step 1: Load the required packages

# Required packages
library(twitteR)       # presumably the source of userTimeline() and status$getText()
library(tm)            # Corpus(), TermDocumentMatrix(), removeSparseTerms()
library(igraph)        # graph construction and layout
library(ggplot2)       # final plot (the script uses ggplot2 functions, not the old 'ggplot' package)
library(RColorBrewer)  # brewer.pal() colour palette

Step 2: Get tweets from @Greenpeace

# Request up to 1000 tweets from the @Greenpeace timeline.
# NOTE(review): userTimeline() is not defined in this file; presumably it
# comes from the twitteR package, which must be loaded -- confirm.
gp_tweets <- userTimeline("Greenpeace", n = 1000)

# Extract the text of every returned status object.
# vapply() always yields a character vector (even for an empty timeline),
# whereas sapply() would return an empty list in that case.
gp_text <- vapply(gp_tweets, function(x) x$getText(), character(1))

Step 3: Create corpus and term-document matrix

# Create a corpus from the tweet texts via VectorSource
gp_corpus <- Corpus(VectorSource(gp_text))

# Stopwords: the default list plus two terms added by hand
gp_stopwords <- unique(c(stopwords(), "greenpeace", "via"))

# Control options applied while building the term-document matrix.
# `wordLengths = c(4, Inf)` keeps only terms with at least 4 characters; it
# replaces the obsolete `minWordLength = 4` option, which current versions of
# tm do not recognise (so the length filter was silently skipped).
trans <- list(
  weighting = weightTf,
  stopwords = gp_stopwords,
  removePunctuation = TRUE,
  tolower = TRUE,
  wordLengths = c(4, Inf),
  removeNumbers = TRUE
)

# Create the term-document matrix (rows = terms, columns = tweets)
gp_tdm <- TermDocumentMatrix(gp_corpus, control = trans)

Step 4: This is an optional step that might help us to get rid of some sparse terms

# Drop terms whose sparsity exceeds 0.995, then convert the reduced
# term-document matrix into an ordinary matrix in a single step.
gp_clean <- as.matrix(removeSparseTerms(gp_tdm, 0.995))

Step 5: Now we need to create the graph

# Word-by-word affiliation matrix: the product of the term matrix with its
# own transpose.
affi_matrix <- gp_clean %*% t(gp_clean)

# Adjacency matrix: same values, but with the diagonal zeroed out.
adja_matrix <- `diag<-`(affi_matrix, 0)

# Build the weighted graph from the adjacency matrix.
gp_graph <- graph.adjacency(adja_matrix, weighted = TRUE)

Step 6: In order to plot the graph we need to get the x, y coordinates

# Coordinates for visualization: Fruchterman-Reingold layout driven by the
# edge weights. The original passed an unnamed list(weightsA = ...) as the
# second argument; the key was misspelled ("weights") and the params-list
# interface is obsolete, so the weights are now passed by name.
posi_matrix <- layout_with_fr(gp_graph, weights = E(gp_graph)$weight)

# Prepend the vertex names. cbind() with a character vector turns the whole
# matrix into character: column 1 = word, columns 2 and 3 = x and y as strings.
posi_matrix <- cbind(V(gp_graph)$name, posi_matrix)

Step 7: Join all the ingredients in a data frame

# Assemble the plotting data frame straight from the columns of posi_matrix:
# column 1 holds the word, columns 2 and 3 hold the coordinates as strings,
# so they are converted back to numeric here.
gp_df <- data.frame(
  word = posi_matrix[, 1],
  x = as.numeric(posi_matrix[, 2]),
  y = as.numeric(posi_matrix[, 3]),
  stringsAsFactors = FALSE
)

Step 8: Make a first plot

# Size effect: diagonal of the affiliation matrix rescaled by its maximum
se <- diag(affi_matrix) / max(diag(affi_matrix))

# Dark background for the base-graphics device
par(bg = "gray15")

# Empty canvas spanning the word coordinates (no axes, labels or box)
plot(
  gp_df$x, gp_df$y,
  type = "n", xaxt = "n", yaxt = "n",
  xlab = "", ylab = "", bty = "n"
)

# Draw the words: text size from log10 of the diagonal values, colour
# saturation and transparency from the size effect
text(
  gp_df$x, gp_df$y,
  labels = gp_df$word,
  cex = log10(diag(affi_matrix)),
  col = hsv(0.95, se, 1, alpha = se)
)

Step 9: To improve our graph, we can perform a k-means cluster analysis to find groups

# k-means with 7 clusters on the (x, y) coordinates stored as strings in
# columns 2 and 3 of posi_matrix
words_km <- kmeans(matrix(as.numeric(posi_matrix[, 2:3]), ncol = 2), centers = 7)

# Attach the diagonal of the affiliation matrix (as `freq`) and the cluster
# label of every word to the data frame
gp_df <- transform(
  gp_df,
  freq = diag(affi_matrix),
  cluster = as.factor(words_km$cluster)
)

# Reset the row names to plain consecutive integers
row.names(gp_df) <- seq_len(nrow(gp_df))

Step 10: Final plot

# Final graphic with ggplot2: words placed at their layout coordinates,
# sized by `freq` and coloured by k-means cluster.
#
# Changes from the original:
#   * opts(), theme_blank(), theme_rect() and theme_text() are defunct in
#     ggplot2; theme(), element_blank(), element_rect() and element_text()
#     are their replacements, and the title moves to labs().
#   * alpha is a constant, so it is set outside aes() instead of being mapped
#     through a scale.
#   * aes() refers to the column `word` rather than `gp_df$word`.
gp_words <- ggplot(gp_df, aes(x = x, y = y)) +
  geom_text(aes(size = freq, label = word, colour = as.factor(cluster)),
            alpha = 0.90) +
  labs(x = "", y = "",
       title = "Graph of words from @Greenpeace Tweets - 05/22/2012") +
  scale_size_continuous(breaks = c(10, 20, 30, 40, 50, 60, 70, 80, 90),
                        range = c(1, 8)) +
  scale_colour_manual(values = brewer.pal(8, "Dark2")) +
  scale_x_continuous(breaks = c(min(gp_df$x), max(gp_df$x)),
                     labels = c("", "")) +
  scale_y_continuous(breaks = c(min(gp_df$y), max(gp_df$y)),
                     labels = c("", "")) +
  theme(
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "gray10", colour = "gray10"),
    axis.ticks = element_blank(),
    legend.position = "none",
    plot.title = element_text(size = 12)
  )

# Save the image in PDF format
ggsave(plot = gp_words, filename = "Greenpeace_wordgraph.pdf",
       height = 10, width = 10)

Page updated

Google Sites

Report abuse