Red Hen's Edge and Edge2 Search Engines and her command-line tools (peck and family) produce output in comma-separated files designed to be imported straight into R.
A syllabus for learning to use the statistical software package R on Red Hen data is available at https://docs.google.com/document/d/1tP3j3ftw_d_iqdpLNnl5M53AhaHFcdz0QpHR4BvprP4/edit
https://catalog.data.gov/dataset/animal-care-and-control-adopted-animals
https://www.kaggle.com/uciml/iris
Collaboration resources
Packages and extensions
Development framework
Tutorials
Start, help, and end
R # start R in Linux (installed on Roma)
?keyword # get help on a function (guessing is ok)
?plot(type) # get help on a parameter within a function
q() # quit
Installed by root on cartago (available to all)
> install.packages("ggplot2")> install.packages("histogram")> install.packages("tm") -- the text mining library> install.packages("stm") -- Estimation of the Structural Topic Model (topic modeling)?read.csv # helpSet working directory (can also be set in /etc/R/Rprofile.site)
> setwd("/home/csa/Pattern2.6")Read a csv file
df <- read.csv(inFile, header=TRUE, comment.char="#", stringsAsFactors=FALSE)> eb<-read.csv("/Users/steen/tna/csv/Erin_Burnett.csv",header=TRUE)> ebpositive = eb$Positive> ebsubjective = eb$Subjective> ng<-read.csv("/Users/steen/tna/csv/Nancy_Grace.csv",header=TRUE)> ngpositive = ng$Positive> ngsubjective = ng$Subjective> List3 <- read.table("file://tmp/list3", sep=" ", header=TRUE) # space-separated table> heat <-read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv",header=TRUE)> SMT1Positive = heat$SMT_01_positivity> library(ggplot2)> library(scales)> heatdates <- as.Date(eb$Date)In little r (from Dirk Edelbuettel)
$ r -e'dat <- read.csv("/tmp/david.csv"); \ res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -Remove objects
> rm(days,mydates,sdate3)select variables v1, v2, v3
myvars <- c("v1", "v2", "v3") newdata <- mydata[myvars]another method
myvars <- paste("v", 1:3, sep="") newdata <- mydata[myvars]select 1st and 5th through 10th variables
newdata <- mydata[c(1,5:10)] Select a column
> heat <- read.csv("/Users/steen/tna/csv/2014-07-22_Heatmap-sentiment.csv", header = TRUE, as.is = TRUE)
> heat20040603 <- subset(heat,Date <= "2004-06-03") # Works if you use as.is = TRUE
> FaF <- subset(heat,Show = "Fox_and_Friends") # Does not work: a single = is not a comparison; use Show == "Fox_and_Friends"
> library(sqldf)
> FaF <- read.csv.sql(heat, sql = 'select * from file where Show = "Fox_and_Friends"') # Fails: read.csv.sql expects a file name as its first argument, not a data frame
Numerical ranges
newdata <- subset(mydata, age >= 20 | age < 10, select=c(ID, Weight))Select rows by value range and range of columns by order
newdata <- subset(mydata, sex=="m" & age > 25, select=weight:income)CNN <- subset(heat, Network=="CNN", select=Date:SMT_01_positivity)cnnebofhm<- subset(hm, Network=="CNN" & Show=="Erin_Burnett_Out_Front")Getting information on a dataset
Show objects
> ls()Show variables / headers
> names(heat) [1] "Date" "Hour" "Country" [4] "Network" "Show" "Caption_wc" [7] "SMT_01_wc" "SMT_01_positivity" "SMT_01_subjectivity"[10] "SMT_02_wc" "SMT_02_positivity" "SMT_02_subjectivity"Show third column name
> colnames(heat)[3]Data structure (useful for spotting inconsistencies in the data)
> str(heat)'data.frame': 230893 obs. of 12 variables:$ Date : Date, format: "2004-06-01" "2004-06-02" ...$ Hour : int 1300 1300 1300 1300 1300 1300 1300 1300 1300 1300 ...$ Country : Factor w/ 13 levels "AF","BE","CA",..: 13 13 13 13 13 13 13 13 13 13 ...$ Network : Factor w/ 39 levels "AlJazeera","BBC",..: 39 39 39 39 39 39 39 39 39 39 ...$ Show : Factor w/ 1873 levels "1600_Pennsylvania_Avenue",..: 442 442 442 442 442 442 442 ...$ Caption_wc : int 5336 6946 8908 5366 7538 6582 7410 6261 6778 4122 ...$ SMT_01_wc : int 331 467 636 354 532 427 430 448 474 249 ...$ SMT_01_positivity : num 24.4 10.5 49.5 34.3 35.2 ...$ SMT_01_subjectivity: num 138 202 273 174 219 ...$ SMT_02_wc : int 1019 1364 1778 1032 1529 1291 1384 1237 1318 733 ...$ SMT_02_positivity : num 13 -26.38 8.38 -1.25 2 ...$ SMT_02_subjectivity: num 185 266 358 222 276 ...Print first column
> heat[1]Print second and fourth column
> heat[c(2,4)]Print first 10 rows
> head(heat, n=10)Print last 10 rows of second and fourth column
> tail(heat[c(2,4)], n=10)(today <- Sys.Date())format(today, "%d %b %Y") # with month as a word(tenweeks <- seq(today, length.out=10, by="1 week")) # next ten weeksweekdays(today)months(tenweeks)Convert a date column to dates
heat$Date <- as.Date(heat$Date , "%Y-%m-%d")Show first date
> head(heat[1], n=1) Date1 2004-06-01Show last date
> tail(heat[1], n=1) Date230893 2014-07-20Show days between two dates
mydates <- as.Date(c("2007-06-22", "2004-02-13"))days <- mydates[1] - mydates[2]Rename a column
> colnames(heat)[3] <- "Network"For loops
for (x in c(1:10)) print(sqrt(x))
for (x in c(0:400)) print(x^2)
History
history()savehistory(file = "name")loadhistory(file = "name")On the fly from a file
dat <- read.csv("smt2csv-02.py-output.csv", header=FALSE)res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11]))On the fly from stdin, using little r
cat smt2csv-02.py-output.csv | r -e'dat <- read.csv(file("stdin")); \ res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -Shortened version from 2014-07-26 update:
cat smt2csv-02.py-output.csv |\ r -d -e'\ res <- data.frame(mn1=mean(dat[,10]),sd1=sd(dat[,10]),mn2=mean(dat[,11]),sd2=sd(dat[,11])); print(res)' -Individually
> mean(ebpositive)[1] 0.03487137> var(ebpositive)[1] 0.1058562> sd(ebpositive)[1] 0.3253556> mean(ebsubjective)[1] 0.2791462> var(ebsubjective)[1] 0.0977887> sd(ebsubjective)[1] 0.3127119> mean(ngpositive)[1] 0.02013443> var(ngpositive)[1] 0.1135885> sd(ngpositive)[1] 0.3370289> mean(ngsubjective)[1] 0.2875757> var(ngsubjective)[1] 0.09846148> sd(ngsubjective)[1] 0.3137857Boxplot
> boxplot(eb$Positive, ng$Positive, col="lightblue")> boxplot(eb$Subjective, ng$Subjective, col="red")Histogram
> library(histogram)> histogram(eb$Positive)Rscript
Rscript is included in R and is the default method of scripting. You can run Rscripts from the commandline or from a bash script.
littler
Little r is an alternative and more flexible scripting system for R written by Dirk Eddelbuettel. With r, you can do things like read piped output; you can also run r commands straight inside bash scripts.
Obsolete style -- run a script silently or verbosely
R --vanilla --slave <delay.R # or use --save to keep historyR --vanilla --verbose <faculty-teaching.R # get some debugging informationDefine a vector and display the contents
x <- c(0:401)xGenerate a vector with cumulative values from a vector or a matrix
Cumulative <- cumsum(x)Cumulative <- cumsum(WordCount$V2)Find vector names in a matrix
names(List3)Echo the content of a vector in a matrix
List3$V1Pair vectors from two matrices
TwoColumns <- paste(Record$V1,Delay$V1)Set a default matrix
attach(List3)Plot vectors
plot(x,sample(x)) # plot the vector against a randomized version of itselfplot(x,sample(x)^20,xlab="x",ylab="y",main="This is my first Rtwork")plot(y=Delay$V1,x=Record$V1)Plot the data
plot(soc$N, (soc$N * soc$Students),
  ylab="Cumulative enrollment over the last four quarters",
  xlab="Number of classes taught",
  col.axis = "sky blue",
  col.lab = "grey45",
  pch = 19, # plotting character -- 19 is a circle
  cex = 1, # or cex 1:10/5 if you want increasing sizes
  col = "green", # If you want green circles for data
  # type = "n", # If you don't want to plot points yet
  xlim = c(0,7), # Set the x-axis scale
  ylim = c(0,550)) # It may be possible to use
Add a title, using legend colors green and blue
mtext(c("Ladder faculty teaching in ", "Comm", " and ", "Soc"),
  cex = 1.5, # scale text size
  col = c(1,4,1,3), # 0 white, 1 black, 2 red, 3 green, 4 blue
  at = c(2,4.4,5.27,5.9), # clumsy solution -- must be manually adjusted
  line = 2)
Add a subtitle
mtext(c("2008-2009 undergraduate classes, excluding sections and fiat lux"), line = 0.7)Add labeled data points (offset left and right)
text ( (soc$N - 0.3), (soc$N * soc$Students), labels = soc$Name, col = "green" )text ( (com$N + 0.3), (com$N * com$Students), labels = com$Name, col = "blue" )Add points to an existing plot
points(soc$N, (soc$N * soc$Students))Get yesterday's date
day <- Sys.Date() - 1Format the date
day <- format(day, "%d %B %Y")Add the formatted date to the title
title <- bquote(bold(paste("Postprocessing times on ", .(day))))Admittedly a terrible syntax for adding a variable to a string
var <- bquote(paste("string", .(var)))Plot a vector without x-axis labels and add the title string and color
plot(Delay$V1, xaxt="n", xlab="Time of recording", ylab="Delay in minutes",main=title, col="blue")
Add the times into the x-axis labels
axis(1, at=1:(length(Delay$V1)), labels=Record$V1, tick=FALSE, col="red")Cumulative word count with area graph
plot(cumsum(WordCount$V2), xaxt="n", xlab="Dates", ylab="Words", main=title,pch = 19, cex = -1 , col = "green", type="h")Histogram
hist(V1,ylab="Frequency",xlab="Delay in minutes", main=title)Write
x <- matrix(1:10,ncol=5)write(t(x)) -- writes x to a file called data (default file name)write(x, "", sep = "\t") -- writes x tab-delimited to the screen (not to file)write(x, "myfile", sep = "\t") -- writes x tab-delimited to the file myfileMatchIt
maps
tag cloud
access Amazon, NYT, Google Trends
flash output
audio search
Reference
Tutorials
Blogs
Many R components have been packaged for Debian, starting with r-base-core, but packages can also be downloaded directly from within R itself, and install into /usr/local/lib/R/site-library. Cran download site USA (CA 2) is http://cran.stat.ucla.edu.
Debian packages
Types of R packages in Debian:
r-base*
r-block*
r-cran*
r-doc*
r-mathlib
r-other*
Recommended for installation:
r-recommended
animation
install.packages("animation")
ggplot2
install.packages("ggplot2")