given topic

Talking About Some Given Topic

Talking about genetics and genomics

Let's say we are interested in the topics "genetics" and "genomics", and that we want to get an idea of the main terms used in tweets about them and how those terms relate to each other. An interesting way to achieve this is to use what I call a word-graph, like the following one:

Step 1: Load the required packages

# load packages

library(XML)

library(tm)

library(igraph)

library(RColorBrewer)

Step 2: Let's get some tweets containing "genetics" and "genomics"

# define twitter search url (following the atom standard)
# NOTE(review): this unauthenticated Atom search endpoint belongs to the
# legacy Twitter API and is presumably no longer served -- confirm before
# relying on it.
twitter_url = "http://search.twitter.com/search.atom?"

# encode query
query = URLencode("genetics AND genomics")

# one slot per result page (17 pages, up to 100 tweets each); collecting the
# pages in a preallocated list avoids re-copying the result on every iteration
pages = vector("list", 17)

# paginate 17 times to harvest tweets
for (page in seq_along(pages))
{
  # create twitter search query to be parsed
  # (the page number must be passed as "page=<n>")
  twitter_search = paste0(twitter_url, "q=", query,
                          "&rpp=100&lang=en&page=", page)

  # let's parse with xmlParseDoc
  tmp = xmlParseDoc(twitter_search, asText=FALSE)

  # extract titles (the text of every Atom <entry><title> node)
  pages[[page]] = xpathSApply(tmp, "//s:entry/s:title",
                              xmlValue, namespaces=c('s'='http://www.w3.org/2005/Atom'))
}

# flatten to a plain character vector; pages without entries contribute nothing
tweets = as.character(unlist(pages))

Step 3: Let's pre-process the data (cleaning)

# work on a copy so the raw tweets stay available
results = tweets

# remove retweet entities ("RT @user" / "via @user", possibly several users)
results = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", results)

# remove at people
results = gsub("@\\w+", "", results)

# remove punctuation
results = gsub("[[:punct:]]", "", results)

# remove numbers
results = gsub("[[:digit:]]", "", results)

# remove html links
# (punctuation is already gone, so a link is a single word starting with "http")
results = gsub("http\\w+", "", results)

# remove unnecessary spaces: collapse every run of blanks into ONE space
# (replacing it with nothing would glue the neighbouring words together),
# then trim leading and trailing whitespace
results = gsub("[ \t]{2,}", " ", results)
results = gsub("^\\s+|\\s+$", "", results)

Step 3.1: Sometimes the function tolower doesn't behave as we expect, and it returns weird error messages. That's why we need a modified version that skips those errors

# define "tolower error handling" function

# Lower-case `x`, returning NA instead of stopping when tolower() fails.
#
# x: the value to convert (typically a single tweet).
# Returns tolower(x), or NA if tolower(x) signals an error.
tryTolower = function(x)
{
  # evaluate tolower() only once; any error is mapped to a missing value
  tryCatch(tolower(x), error=function(e) NA)
}

# lower case using tryTolower; vapply always yields a plain character vector
# (sapply returns a list for empty input), and tweets that could not be
# converted come back as NA
results = vapply(results, function(x) as.character(tryTolower(x)),
                 character(1), USE.NAMES=FALSE)
names(results) = NULL

# remove failed (NA) and empty results (if any)
# (a bare `results != ""` would keep the NA entries)
results = results[!is.na(results) & results != ""]

Step 4: Create Lexical Corpus and term-document matrix

# words to ignore: the English stopwords plus the search terms themselves
skipwords = c(stopwords("english"), "genetics", "genomics", "genetic", "genome")

# build the corpus from the cleaned tweets and strip the ignored words
corpus = tm_map(Corpus(VectorSource(results)), removeWords, skipwords)

# term-document matrix of the corpus
tdm = TermDocumentMatrix(corpus)

# the same data as an ordinary matrix
m = as.matrix(tdm)

Step 5: Matrix with frequent words

# word counts (row totals of the term-document matrix)
wc = rowSums(m)

# get those words whose count is above the median (the 0.5 quantile)
lim = quantile(wc, probs=0.5)

# drop=FALSE keeps a matrix even when a single word survives the filter
good = m[wc > lim, , drop=FALSE]

# remove columns (docs) with zeroes
# drop=FALSE keeps a matrix even when a single document remains
good = good[, colSums(good) != 0, drop=FALSE]

Step 6: Obtain an adjacency matrix and create a graph

# adjacency matrix: cross-product of the frequent-word rows
# (same value as good %*% t(good))
M = tcrossprod(good)

# a word is not linked to itself
diag(M) = 0

# weighted, undirected graph built from the adjacency matrix
# NOTE(review): add.rownames is passed TRUE; igraph documents this argument as
# a character scalar, NA or NULL -- confirm it behaves as intended with the
# installed igraph version.
g = graph.adjacency(M, mode="undirected", weighted=TRUE,
                    add.rownames=TRUE)

# Fruchterman-Reingold layout of the graph
glay = layout.fruchterman.reingold(g)

# superimpose a cluster structure: k-means with 8 centers on the rows of M
kmg = kmeans(M, centers=8)

# cluster number assigned to each word
gk = kmg$cluster

Step 7: Prepare a nice color palette

# base colors for the clusters: "red" followed by the 8 "Dark2" colors
gbrew = c("red", brewer.pal(8, "Dark2"))

# the same colors as an HSV matrix (rows h, s, v; one column per color)
gpal = rgb2hsv(col2rgb(gbrew))

# look up each word's color by its cluster number (1 to 8) in one vectorised
# call, making every color half transparent
gcols = hsv(gpal[1, gk], gpal[2, gk], gpal[3, gk], alpha=0.5)

Step 8: Create the graph to see what people are talking about

# vertex appearance: fixed size, no border, filled with the cluster color,
# labelled with the vertex name in dark translucent gray
V(g)$size = 10
V(g)$frame.color = NA
V(g)$color = gcols
V(g)$label = V(g)$name
V(g)$label.color = hsv(0, 0, 0.2, 0.55)

# vertex degree (only used by the optional label scaling below)
V(g)$degree = degree(g)
#V(g)$label.cex = 1.5 * log10(V(g)$degree)

# edges in light translucent gray
E(g)$color = hsv(0, 0, 0.7, 0.3)

# draw the graph with the precomputed layout, then add the title
plot(g, layout=glay)
title(main="\nGraph of tweets about genetics and genomics",
      family="serif", cex.main=1.5, col.main="gray40")

© Gaston Sanchez