Talking About Some Given Topic    talking about

Talking about genetics and genomics
Let's say we are interested in the topic "genetics" and "genomics" and that we want to have an idea of the main terms used in tweets and their possible relationship. An interesting option to achieve this goal is to use what I call a word-graph like the following one:



Step 1: Load the required packages
# load packages
library(XML)
library(tm)
library(igraph)
library(RColorBrewer)


Step 2: Let's get some tweets containing "genetics" and "genomics"
# define twitter search url (following the atom standard)
# NOTE(review): this Atom search endpoint may no longer be served by Twitter --
# confirm it still responds before relying on this step
twitter_url = "http://search.twitter.com/search.atom?"

# encode query (spaces become %20 so the query is safe inside a URL)
query = URLencode("genetics AND genomics")

# vector to store results
tweets = character(0)

# paginate 17 times to harvest tweets (up to 100 results per page via rpp=100)
for (page in 1:17)
{
   # create twitter search query to be parsed; the page number has to be
   # passed as "page=<n>" so that each iteration requests a different page
   twitter_search = paste(twitter_url, "q=", query,
      "&rpp=100&lang=en&page=", page, sep="")

   # let's parse with xmlParseDoc (asText=FALSE: the string is a location
   # to read from, not the XML content itself)
   tmp = xmlParseDoc(twitter_search, asText=FALSE)

   # extract the <title> of every Atom <entry>, i.e. the tweet text
   titles = xpathSApply(tmp, "//s:entry/s:title",
      xmlValue, namespaces=c('s'='http://www.w3.org/2005/Atom'))

   # as.character(unlist()) keeps 'tweets' a character vector even if a page
   # has no matching entries (xpathSApply may then return an empty list)
   tweets = c(tweets, as.character(unlist(titles)))
}


Step 3: Let's pre-process the data (cleaning)
# work on a copy so the raw tweets stay available
results = tweets
# remove retweet entities ("RT @user", "via @user", possibly several users)
results = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", results)
# remove at people
results = gsub("@\\w+", "", results)
# remove punctuation
results = gsub("[[:punct:]]", "", results)
# remove numbers
results = gsub("[[:digit:]]", "", results)
# remove html links; punctuation was already stripped above, so what is left
# of a link is a single "http..." word, which this pattern deletes
results = gsub("http\\w+", "", results)
# remove unnecessary spaces: collapse every run of blanks/tabs into ONE space
# (replacing the run with "" would glue the neighbouring words together)
results = gsub("[ \t]{2,}", " ", results)
# trim leading and trailing whitespace
results = gsub("^\\s+|\\s+$", "", results)


Step 3.1: Sometimes the function tolower doesn't behave as we expect and returns weird error messages. That's why we need a modified version that skips those errors
# define "tolower error handling" function 
# Lower-case 'x' without ever raising an error.
#
# x: the value to convert (typically a character vector)
#
# Returns tolower(x) when the conversion succeeds, and NA when tolower()
# signals an error for this input.
tryTolower = function(x)
{
   # evaluate tolower() once; the error handler supplies the NA fallback
   tryCatch(tolower(x), error=function(e) NA)
}

# lower case every tweet with tryTolower, one tweet at a time, so that a
# tweet which cannot be converted becomes NA instead of aborting the script;
# as.character(unlist()) always yields a plain character vector, even when
# there are no tweets at all
results = as.character(unlist(lapply(results, tryTolower), use.names=FALSE))
names(results) = NULL

# remove failed (NA) and empty results (if any); the explicit is.na() test
# is needed because NA != "" is NA, and an NA index would keep the NA entry
results = results[!is.na(results) & results != ""]


Step 4: Create Lexical Corpus and term-document matrix
# one corpus document per cleaned tweet
corpus = Corpus(VectorSource(results))

# words to drop: the standard english stopwords plus the search terms
# themselves (and close variants)
query_terms = c("genetics", "genomics", "genetic", "genome")
skipwords = c(stopwords("english"), query_terms)
corpus = tm_map(corpus, removeWords, skipwords)

# term-document matrix built from the filtered corpus,
# then turned into an ordinary matrix
tdm = TermDocumentMatrix(corpus)
m = as.matrix(tdm)


Step 5: Matrix with frequent words
# word counts: total occurrences of each term (row of m) over all documents
wc = rowSums(m)

# get those words whose count is strictly above the median
# (probs=0.5 is the 50% quantile, not the third quartile)
lim = quantile(wc, probs=0.5)
# drop=FALSE keeps 'good' a matrix even if a single term survives
good = m[wc > lim, , drop=FALSE]

# remove columns (docs) with zeroes, i.e. documents that contain none of the
# retained terms; again keep the matrix shape if one document is left
good = good[, colSums(good) != 0, drop=FALSE]


Step 6: Obtain an adjacency matrix and create a graph
# term-by-term adjacency matrix: the cross-product of the term rows of
# 'good' over all documents
M = tcrossprod(good)

# a term is not linked to itself
diag(M) = 0

# weighted, undirected graph whose vertices are named after the rows of M
g = graph.adjacency(M, mode="undirected", weighted=TRUE,
   add.rownames=TRUE)

# vertex coordinates from the Fruchterman-Reingold layout
glay = layout.fruchterman.reingold(g)

# overlay a cluster structure: k-means on the rows of M with 8 centers;
# gk holds the cluster number assigned to each term
kmg = kmeans(M, centers=8)
gk = kmg$cluster


Step 7: Prepare a nice color palette
# candidate colors: "red" followed by the Dark2 palette, converted to HSV
# (rows of gpal are h, s, v; one column per color)
gbrew = c("red", brewer.pal(8, "Dark2"))
gpal = rgb2hsv(col2rgb(gbrew))

# look up each term's color by its cluster number in one vectorised call;
# every color is made half transparent
gcols = hsv(gpal[1, gk], gpal[2, gk], gpal[3, gk], alpha=0.5)


Step 8: Create the graph to see what people are talking about
# prepare ingredients for plot: everything below is stored as vertex/edge
# attributes on g, then drawn by plot()
# same size for every vertex
V(g)$size = 10
# label each vertex with its name
V(g)$label = V(g)$name
# store the degree of each vertex (only used by the disabled line below)
V(g)$degree = degree(g)
# disabled: would scale the label size with the log of the vertex degree
#V(g)$label.cex = 1.5 * log10(V(g)$degree)
# dark gray, semi-transparent labels
V(g)$label.color = hsv(0, 0, 0.2, 0.55)
# NA frame color (presumably draws no border around vertices -- confirm
# against the igraph plotting parameters)
V(g)$frame.color = NA
# fill colors come from gcols
V(g)$color = gcols
# light gray, mostly transparent edges
E(g)$color = hsv(0, 0, 0.7, 0.3)

# plot using the coordinates stored in glay
plot(g, layout=glay)
# the title text starts with a newline ("\n")
title("\nGraph of tweets about genetics and genomics",
col.main="gray40", cex.main=1.5, family="serif")



Comments