R: Statistical Analysis and Visual Display

Introduction

Red Hen's Edge and Edge2 Search Engines and her command-line tools (peck and family) produce output in comma-separated files designed to be imported straight into R.

A syllabus for learning to use the statistical software package R on Red Hen data is available at https://docs.google.com/document/d/1tP3j3ftw_d_iqdpLNnl5M53AhaHFcdz0QpHR4BvprP4/edit

Resources


Collaboration resources


Packages and extensions

Development framework

Tutorials

Using R

Start, help, and end

R           # start R in Linux (installed on Roma)
?keyword    # get help on a function (guessing is ok)
?plot(type) # get help on a parameter within a function
q()         # quit

Install packages

Installed by root on cartago (available to all)

> install.packages("ggplot2")
> install.packages("histogram")
> install.packages("tm")   # the text mining library
> install.packages("stm")  # Estimation of the Structural Topic Model (topic modeling)

Import files

?read.csv   # help

Set working directory (can also be set in /etc/R/Rprofile.site)

> setwd("/home/csa/Pattern2.6")

Read a csv file

df <- read.csv(inFile, header=TRUE, comment.char="#", stringsAsFactors=FALSE)
> eb<-read.csv("/Users/steen/tna/csv/Erin_Burnett.csv",header=TRUE)
> ebpositive = eb$Positive
> ebsubjective = eb$Subjective
> ng<-read.csv("/Users/steen/tna/csv/Nancy_Grace.csv",header=TRUE)
> ngpositive = ng$Positive
> ngsubjective = ng$Subjective
> List3 <- read.table("file://tmp/list3", sep=" ", header=TRUE)    # space-separated table
> heat <-read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv",header=TRUE)
> SMT1Positive = heat$SMT_01_positivity
> library(ggplot2)
> library(scales)
> heatdates <- as.Date(eb$Date)

In little r (from Dirk Eddelbuettel)

$ r -e'dat <- read.csv("/tmp/david.csv"); \
  res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -

Remove objects

> rm(days,mydates,sdate3)

select variables v1, v2, v3

 myvars <- c("v1", "v2", "v3")
 newdata <- mydata[myvars]

another method

 myvars <- paste("v", 1:3, sep="")
 newdata <- mydata[myvars]

select 1st and 5th through 10th variables

 newdata <- mydata[c(1,5:10)] 

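Select a column

> heat <- read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv", header = TRUE, as.is = TRUE)
> heat20040603 <- subset(heat,Date <= "2004-06-03")   # Works if you use as.is = TRUE
> FaF  <- subset(heat,Show = "Fox_and_Friends")  # Does not work: "=" passes a named argument instead of a comparison, so no rows are filtered
> FaF  <- subset(heat, Show == "Fox_and_Friends")  # Works: use "==" to compare
> library(sqldf)
> FaF <- read.csv.sql(heat, sql = 'select * from file where Show = "Fox_and_Friends"') # Fails: read.csv.sql expects a file name, not a data frame
> FaF <- sqldf("select * from heat where Show = 'Fox_and_Friends'")  # Works: sqldf() queries the data frame directly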

Numerical ranges

newdata <- subset(mydata, age >= 20 | age < 10, select=c(ID, Weight))

Select rows by value range and range of columns by order

newdata <- subset(mydata, sex=="m" & age > 25, select=weight:income)
CNN <- subset(heat, Network=="CNN", select=Date:SMT_01_positivity)
cnnebofhm<- subset(hm, Network=="CNN" & Show=="Erin_Burnett_Out_Front")

Show data

Getting information on a dataset

Show objects

> ls()

Show variables / headers

 > names(heat)
 [1] "Date"                "Hour"                "Country"            
 [4] "Network"             "Show"                "Caption_wc"         
 [7] "SMT_01_wc"           "SMT_01_positivity"   "SMT_01_subjectivity"
[10] "SMT_02_wc"           "SMT_02_positivity"   "SMT_02_subjectivity"

Show third column name

 > colnames(heat)[3]

Data structure (useful for spotting inconsistencies in the data)

 > str(heat)
'data.frame':   230893 obs. of  12 variables:
$ Date               : Date, format: "2004-06-01" "2004-06-02" ...
$ Hour               : int  1300 1300 1300 1300 1300 1300 1300 1300 1300 1300 ...
$ Country            : Factor w/ 13 levels "AF","BE","CA",..: 13 13 13 13 13 13 13 13 13 13 ...
$ Network            : Factor w/ 39 levels "AlJazeera","BBC",..: 39 39 39 39 39 39 39 39 39 39 ...
$ Show               : Factor w/ 1873 levels "1600_Pennsylvania_Avenue",..: 442 442 442 442 442 442 442 ...
$ Caption_wc         : int  5336 6946 8908 5366 7538 6582 7410 6261 6778 4122 ...
$ SMT_01_wc          : int  331 467 636 354 532 427 430 448 474 249 ...
$ SMT_01_positivity  : num  24.4 10.5 49.5 34.3 35.2 ...
$ SMT_01_subjectivity: num  138 202 273 174 219 ...
$ SMT_02_wc          : int  1019 1364 1778 1032 1529 1291 1384 1237 1318 733 ...
$ SMT_02_positivity  : num  13 -26.38 8.38 -1.25 2 ...
$ SMT_02_subjectivity: num  185 266 358 222 276 ...

Print first column

 > heat[1]

Print second and fourth column

 > heat[c(2,4)]

Print first 10 rows

 > head(heat, n=10)

Print last 10 rows of second and fourth column

 > tail(heat[c(2,4)], n=10)

(today <- Sys.Date())
format(today, "%d %b %Y")  # with month as a word
(tenweeks <- seq(today, length.out=10, by="1 week")) # next ten weeks
weekdays(today)
months(tenweeks)

Convert a date column to dates

 heat$Date <- as.Date(heat$Date , "%Y-%m-%d")

Show first date

> head(heat[1], n=1)
       Date
1 2004-06-01

Show last date

> tail(heat[1], n=1)
            Date
230893 2014-07-20

Show days between two dates

mydates <- as.Date(c("2007-06-22", "2004-02-13"))
days <- mydates[1] - mydates[2]

Basic functions

Rename a column

> colnames(heat)[3] <- "Network"

For loops

for (x in c(1:10)) print(sqrt(x))
for (x in c(0:400)) print(x^2)

History

history()
savehistory(file = "name")
loadhistory(file = "name")

Mean, var, sd

On the fly from a file

dat <- read.csv("smt2csv-02.py-output.csv", header=FALSE)
res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11]))

On the fly from stdin, using little r

cat smt2csv-02.py-output.csv | r -e'dat <- read.csv(file("stdin")); \
  res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -

Shortened version from 2014-07-26 update:

cat smt2csv-02.py-output.csv | r -d -e'\
  res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -

Individually

> mean(ebpositive)
[1] 0.03487137
> var(ebpositive)
[1] 0.1058562
> sd(ebpositive)
[1] 0.3253556
> mean(ebsubjective)
[1] 0.2791462
> var(ebsubjective)
[1] 0.0977887
> sd(ebsubjective)
[1] 0.3127119
> mean(ngpositive)
[1] 0.02013443
> var(ngpositive)
[1] 0.1135885
> sd(ngpositive)
[1] 0.3370289
> mean(ngsubjective)
[1] 0.2875757
> var(ngsubjective)
[1] 0.09846148
> sd(ngsubjective)
[1] 0.3137857

Boxplot

> boxplot(eb$Positive, ng$Positive, col="lightblue")
> boxplot(eb$Subjective, ng$Subjective, col="red")

Histogram

> library(histogram)
> histogram(eb$Positive)

Scripting

Rscript

Rscript is included in R and is the default method of scripting. You can run Rscripts from the command line or from a bash script.

littler

Little r is an alternative and more flexible scripting system for R written by Dirk Eddelbuettel. With r, you can do things like read piped output; you can also run r commands straight inside bash scripts.

Obsolete style -- run a script silently or verbosely

R --vanilla --slave <delay.R               # or use --save to keep history
R --vanilla --verbose <faculty-teaching.R  # get some debugging information

Vectors

Define a vector and display the contents

x <- c(0:401)
x

Generate a vector with cumulative values from a vector or a matrix

Cumulative <- cumsum(x)
Cumulative <- cumsum(WordCount$V2)

Find vector names in a matrix

names(List3)

Echo the content of a vector in a matrix

List3$V1

Pair vectors from two matrices

TwoColumns <- paste(Record$V1,Delay$V1)

Set a default matrix

attach(List3)

Plot

Plot vectors

plot(x,sample(x)) # plot the vector against a randomized version of itself
plot(x,sample(x)^20,xlab="x",ylab="y",main="This is my first Rtwork")
plot(y=Delay$V1,x=Record$V1)

Plot the data

plot(soc$N, (soc$N * soc$Students),
 ylab="Cumulative enrollment over the last four quarters",
 xlab="Number of classes taught",
 col.axis = "sky blue",
 col.lab = "grey45",
 pch = 19,        # plotting character -- 19 is a circle
 cex = 1,         # or cex 1:10/5 if you want increasing sizes
 col = "green",   # If you want green circles for data
 # type = "n",    # If you don't want to plot points yet
 xlim = c(0,7),   # Set the x-axis scale
 ylim = c(0,550)) # It may be possible to use 

Add a title, using legend colors green and blue

mtext(c("Ladder faculty teaching in ", "Comm", " and ", "Soc"),
 cex = 1.5,        # scale text size
 col = c(1,4,1,3), # 0 white, 1 black, 2 red, 3 green, 4 blue
 at = c(2,4.4,5.27,5.9),  # clumsy solution -- must be manually adjusted
 line = 2)

Add a subtitle

mtext(c("2008-2009 undergraduate classes, excluding sections and fiat lux"), line = 0.7)

Add labeled data points (offset left and right)

text ( (soc$N - 0.3), (soc$N * soc$Students), labels = soc$Name, col = "green" )
text ( (com$N + 0.3), (com$N * com$Students), labels = com$Name, col = "blue" )

Add points to an existing plot

points(soc$N, (soc$N * soc$Students))

Get yesterday's date

day <- Sys.Date() - 1

Format the date

day <- format(day, "%d %B %Y")

Add the formatted date to the title

title <- bquote(bold(paste("Postprocessing times on ", .(day))))

Admittedly a terrible syntax for adding a variable to a string

var <-  bquote(paste("string", .(var)))

Plot a vector without x-axis labels and add the title string and color

plot(Delay$V1, xaxt="n", xlab="Time of recording", ylab="Delay in minutes",
main=title, col="blue")

Add the times into the x-axis labels

axis(1, at=1:(length(Delay$V1)), labels=Record$V1, tick=FALSE, col="red")

Cumulative word count with area graph

plot(cumsum(WordCount$V2), xaxt="n", xlab="Dates", ylab="Words", main=title,
pch = 19, cex = -1 , col = "green", type="h")

Histogram

hist(V1,ylab="Frequency",xlab="Delay in minutes", main=title)

Write

x <- matrix(1:10,ncol=5)
write(t(x))                     # writes t(x) to a file called "data" (default file name)
write(x, "", sep = "\t")        # writes x tab-delimited to the screen (not to file)
write(x, "myfile", sep = "\t")  # writes x tab-delimited to the file myfile

Cool R ideas

MatchIt

maps

tag cloud

access Amazon, NYT, Google Trends

flash output

audio search

R Documentation

Reference

Tutorials

Blogs

Installing R

Many R components have been packaged for Debian, starting with r-base-core, but packages can also be downloaded directly from within R itself; these are installed into /usr/local/lib/R/site-library. The CRAN download site for the USA (CA 2) is http://cran.stat.ucla.edu.

Debian packages

Types of R packages in Debian:

r-base*
r-block*
r-cran*
r-doc*
r-mathlib
r-other*

Recommended for installation:

r-recommended

animation

install.packages("animation")

ggplot2

install.packages("ggplot2")