Red Hen's Edge and Edge2 Search Engines and her command-line tools (peck and family) produce output in comma-separated files designed to be imported straight into R.
A syllabus for learning to use the statistical software package R on Red Hen data is available at https://docs.google.com/document/d/1tP3j3ftw_d_iqdpLNnl5M53AhaHFcdz0QpHR4BvprP4/edit
https://catalog.data.gov/dataset/animal-care-and-control-adopted-animals
https://www.kaggle.com/uciml/iris
Collaboration resources
Packages and extensions
Development framework
Tutorials
Start, help, and end
R # start R in Linux (installed on Roma)
?keyword # get help on a function (guessing is ok)
?plot(type) # get help on a parameter within a function
q() # quit
Installed by root on cartago (available to all)
> install.packages("ggplot2")
> install.packages("histogram")
> install.packages("tm") # the text mining library
> install.packages("stm") # Estimation of the Structural Topic Model (topic modeling)
?read.csv # help
Set working directory (can also be set in /etc/R/Rprofile.site)
> setwd("/home/csa/Pattern2.6")
Read a csv file
df <- read.csv(inFile, header=TRUE, comment.char="#", stringsAsFactors=FALSE)
> eb<-read.csv("/Users/steen/tna/csv/Erin_Burnett.csv",header=TRUE)
> ebpositive = eb$Positive
> ebsubjective = eb$Subjective
> ng<-read.csv("/Users/steen/tna/csv/Nancy_Grace.csv",header=TRUE)
> ngpositive = ng$Positive
> ngsubjective = ng$Subjective
> List3 <- read.table("file:///tmp/list3", sep=" ", header=TRUE) # space-separated table
> heat <-read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv",header=TRUE)
> SMT1Positive = heat$SMT_01_positivity
> library(ggplot2)
> library(scales)
> heatdates <- as.Date(eb$Date)
In little r (from Dirk Eddelbuettel)
$ r -e'dat <- read.csv("/tmp/david.csv"); \
res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -
Remove objects
> rm(days,mydates,sdate3)
select variables v1, v2, v3
myvars <- c("v1", "v2", "v3")
newdata <- mydata[myvars]
another method
myvars <- paste("v", 1:3, sep="")
newdata <- mydata[myvars]
select 1st and 5th through 10th variables
newdata <- mydata[c(1,5:10)]
Select a column
> heat <- read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv", header = TRUE, as.is = TRUE)
> heat20040603 <- subset(heat,Date <= "2004-06-03") # Works if you use as.is = TRUE
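> FaF <- subset(heat,Show = "Fox_and_Friends") # Does not work: "=" passes a named argument instead of testing equality, so no rows are filtered; use Show == "Fox_and_Friends"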
> library(sqldf)
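> FaF <- read.csv.sql(heat, sql = 'select * from file where Show = "Fox_and_Friends"') # Fails: read.csv.sql expects the name of a csv file, not a data frame; to query the data frame use sqldf('select * from heat where Show = "Fox_and_Friends"')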
Numerical ranges
newdata <- subset(mydata, age >= 20 | age < 10, select=c(ID, Weight))
Select rows by value range and range of columns by order
newdata <- subset(mydata, sex=="m" & age > 25, select=weight:income)
CNN <- subset(heat, Network=="CNN", select=Date:SMT_01_positivity)
cnnebofhm<- subset(hm, Network=="CNN" & Show=="Erin_Burnett_Out_Front")
Getting information on a dataset
Show objects
> ls()
Show variables / headers
> names(heat)
[1] "Date" "Hour" "Country"
[4] "Network" "Show" "Caption_wc"
[7] "SMT_01_wc" "SMT_01_positivity" "SMT_01_subjectivity"
[10] "SMT_02_wc" "SMT_02_positivity" "SMT_02_subjectivity"
Show third column name
> colnames(heat)[3]
Data structure (useful for spotting inconsistencies in the data)
> str(heat)
'data.frame': 230893 obs. of 12 variables:
$ Date : Date, format: "2004-06-01" "2004-06-02" ...
$ Hour : int 1300 1300 1300 1300 1300 1300 1300 1300 1300 1300 ...
$ Country : Factor w/ 13 levels "AF","BE","CA",..: 13 13 13 13 13 13 13 13 13 13 ...
$ Network : Factor w/ 39 levels "AlJazeera","BBC",..: 39 39 39 39 39 39 39 39 39 39 ...
$ Show : Factor w/ 1873 levels "1600_Pennsylvania_Avenue",..: 442 442 442 442 442 442 442 ...
$ Caption_wc : int 5336 6946 8908 5366 7538 6582 7410 6261 6778 4122 ...
$ SMT_01_wc : int 331 467 636 354 532 427 430 448 474 249 ...
$ SMT_01_positivity : num 24.4 10.5 49.5 34.3 35.2 ...
$ SMT_01_subjectivity: num 138 202 273 174 219 ...
$ SMT_02_wc : int 1019 1364 1778 1032 1529 1291 1384 1237 1318 733 ...
$ SMT_02_positivity : num 13 -26.38 8.38 -1.25 2 ...
$ SMT_02_subjectivity: num 185 266 358 222 276 ...
Print first column
> heat[1]
Print second and fourth column
> heat[c(2,4)]
Print first 10 rows
> head(heat, n=10)
Print last 10 rows of second and fourth column
> tail(heat[c(2,4)], n=10)
(today <- Sys.Date())
format(today, "%d %b %Y") # with month as a word
(tenweeks <- seq(today, length.out=10, by="1 week")) # next ten weeks
weekdays(today)
months(tenweeks)
Convert a date column to dates
heat$Date <- as.Date(heat$Date , "%Y-%m-%d")
Show first date
> head(heat[1], n=1)
Date
1 2004-06-01
Show last date
> tail(heat[1], n=1)
Date
230893 2014-07-20
Show days between two dates
mydates <- as.Date(c("2007-06-22", "2004-02-13"))
days <- mydates[1] - mydates[2]
Rename a column
> colnames(heat)[3] <- "Network"
For loops
for (x in c(1:10)) print(sqrt(x))
for (x in c(0:400)) print(x^2)
History
history()
savehistory(file = "name")
loadhistory(file = "name")
On the fly from a file
dat <- read.csv("smt2csv-02.py-output.csv", header=FALSE)
res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11]))
On the fly from stdin, using little r
cat smt2csv-02.py-output.csv | r -e'dat <- read.csv(file("stdin")); \
res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -
Shortened version from 2014-07-26 update:
cat smt2csv-02.py-output.csv | r -d -e'\
res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -
Individually
> mean(ebpositive)
[1] 0.03487137
> var(ebpositive)
[1] 0.1058562
> sd(ebpositive)
[1] 0.3253556
> mean(ebsubjective)
[1] 0.2791462
> var(ebsubjective)
[1] 0.0977887
> sd(ebsubjective)
[1] 0.3127119
> mean(ngpositive)
[1] 0.02013443
> var(ngpositive)
[1] 0.1135885
> sd(ngpositive)
[1] 0.3370289
> mean(ngsubjective)
[1] 0.2875757
> var(ngsubjective)
[1] 0.09846148
> sd(ngsubjective)
[1] 0.3137857
Boxplot
> boxplot(eb$Positive, ng$Positive, col="lightblue")
> boxplot(eb$Subjective, ng$Subjective, col="red")
Histogram
> library(histogram)
> histogram(eb$Positive)
Rscript
Rscript is included in R and is the default method of scripting. You can run Rscripts from the command line or from a bash script.
littler
Little r is an alternative and more flexible scripting system for R written by Dirk Eddelbuettel. With r, you can do things like read piped output; you can also run r commands straight inside bash scripts.
Obsolete style -- run a script silently or verbosely
R --vanilla --slave <delay.R # or use --save to keep history
R --vanilla --verbose <faculty-teaching.R # get some debugging information
Define a vector and display the contents
x <- c(0:401)
x
Generate a vector with cumulative values from a vector or a matrix
Cumulative <- cumsum(x)
Cumulative <- cumsum(WordCount$V2)
Find vector names in a matrix
names(List3)
Echo the content of a vector in a matrix
List3$V1
Pair vectors from two matrices
TwoColumns <- paste(Record$V1,Delay$V1)
Set a default matrix
attach(List3)
Plot vectors
plot(x,sample(x)) # plot the vector against a randomized version of itself
plot(x,sample(x)^20,xlab="x",ylab="y",main="This is my first Rtwork")
plot(y=Delay$V1,x=Record$V1)
Plot the data
plot(soc$N, (soc$N * soc$Students),
ylab="Cumulative enrollment over the last four quarters",
xlab="Number of classes taught",
col.axis = "sky blue",
col.lab = "grey45",
pch = 19, # plotting character -- 19 is a circle
cex = 1, # or cex 1:10/5 if you want increasing sizes
col = "green", # If you want green circles for data
# type = "n", # If you don't want to plot points yet
xlim = c(0,7), # Set the x-axis scale
ylim = c(0,550)) # It may be possible to use
Add a title, using legend colors green and blue
mtext(c("Ladder faculty teaching in ", "Comm", " and ", "Soc"),
cex = 1.5, # scale text size
col = c(1,4,1,3), # 0 white, 1 black, 2 red, 3 green, 4 blue
at = c(2,4.4,5.27,5.9), # clumsy solution -- must be manually adjusted
line = 2)
Add a subtitle
mtext(c("2008-2009 undergraduate classes, excluding sections and fiat lux"), line = 0.7)
Add labeled data points (offset left and right)
text ( (soc$N - 0.3), (soc$N * soc$Students), labels = soc$Name, col = "green" )
text ( (com$N + 0.3), (com$N * com$Students), labels = com$Name, col = "blue" )
Add points to an existing plot
points(soc$N, (soc$N * soc$Students))
Get yesterday's date
day <- Sys.Date() - 1
Format the date
day <- format(day, "%d %B %Y")
Add the formatted date to the title
title <- bquote(bold(paste("Postprocessing times on ", .(day))))
Admittedly a terrible syntax for adding a variable to a string
var <- bquote(paste("string", .(var)))
Plot a vector without x-axis labels and add the title string and color
plot(Delay$V1, xaxt="n", xlab="Time of recording", ylab="Delay in minutes",
main=title, col="blue")
Add the times into the x-axis labels
axis(1, at=1:(length(Delay$V1)), labels=Record$V1, tick=FALSE, col="red")
Cumulative word count with area graph
plot(cumsum(WordCount$V2), xaxt="n", xlab="Dates", ylab="Words", main=title,
pch = 19, cex = -1 , col = "green", type="h")
Histogram
hist(V1,ylab="Frequency",xlab="Delay in minutes", main=title)
Write
x <- matrix(1:10,ncol=5)
write(t(x)) # writes x to a file called data (default file name)
write(x, "", sep = "\t") # writes x tab-delimited to the screen (not to file)
write(x, "myfile", sep = "\t") # writes x tab-delimited to the file myfile
MatchIt
maps
tag cloud
access Amazon, NYT, Google Trends
flash output
audio search
Reference
Tutorials
Blogs
Many R components have been packaged for Debian, starting with r-base-core, but packages can also be downloaded directly from within R itself; these are installed into /usr/local/lib/R/site-library. The CRAN download site for the USA (CA 2) is http://cran.stat.ucla.edu.
Debian packages
Types of R packages in Debian:
r-base*
r-block*
r-cran*
r-doc*
r-mathlib
r-other*
Recommended for installation:
r-recommended
animation
install.packages("animation")
ggplot2
install.packages("ggplot2")