Very basic R

Install R on Ubuntu (link)

sudo apt-get install r-base

other dependencies

From R:

install.packages('devtools')

If there are errors, sudo apt-get install <name debian package>

and try install.packages('devtools') again

Install RStudio on Ubuntu (link)

sudo apt-get install gdebi-core

# download the RStudio Server package, then install it with gdebi (two separate commands)
wget https://download2.rstudio.org/server/xenial/amd64/rstudio-server-1.3.1093-amd64.deb

sudo gdebi rstudio-server-1.3.1093-amd64.deb

1) R code for basic data munging

# code in R for basic data munging

# inspired by Jeff Leek lecture

# basics of data munging in R

# https://www.youtube.com/watch?v=8DGBOh6hJaE

prismData <- read.csv('combined_hlty_prism_shotgun.csv')

names(prismData)

# names have dot, now split based on dot

splitNames = strsplit(names(prismData),'\\.')

splitNames[[5]]

splitNames[[5]][1]

# apply (map) this function

# lambda x -> x[1] applied to splitNames

# function(x){x[1]}

# helper: keep only the first entry of a vector (e.g. the piece of a name before the first dot)
firstElement <- function(x) x[1L]

sapply(splitNames,firstElement)

head(prismData,3)

# substitute names

tempData <- sapply(splitNames,firstElement)

gsub("_","",tempData)

sub("_","",tempData)

# example to show merge

riskData <- read.csv('combined_hlty_prism_shotgun_MOD.csv')

names(riskData)

head(riskData,3)

merge(prismData,riskData,by.x='cohort',by.y='cohort',all=TRUE)

# only those that match

merge(prismData,riskData,by.x='cohort',by.y='cohort')

# sort values

prismData$cohort

sort(prismData$cohort)

sort(prismData$cohort)[1:10]

# sort data frame

prismData$cohort[1:10]

sort(prismData$cohort[1:10])

2) Install and load packages

install.packages('smatr')

library('smatr')

# Check if package installed, if not then install it (from link)

if(!require("zoo"))

{install.packages("zoo")}

# Load a package from a local (user) library path

library("highr", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.2")

3) Creating data frames

log10crime <- read.csv('log10crime.csv')

log10comm_population <- read.csv('log10comm_population.csv')

# assign to data frame

df = data.frame(log10crime,log10comm_population)

4) Extract first few (head) of data frame

head(df)

> log10crime log10comm

1 2.2355 4.0785

2 2.6828 4.3640

# Note: log10crime and log10comm are headers

5) Reference column of data frame by column name

df["log10crime"]

6) Number of rows in data frame

nrow(df)

7) Formula in R (note: sma is a function in the smatr package; load using library(smatr) )

sma(formula='log10crime~log10comm', data=df)

8) Good IDE (RStudio)

9) Help on something

?factor

10) List of numbers

period <- 2003:2012

11) Array index

dat = t(harborSealWA) # transpose

years = dat[1,] # 1st row of dat contains years

12) Transpose (from MARSS package)

dat = t(harborSealWA)

13) Access arrays (from MARSS package)

years = dat[1,] # 1st row of dat contains years

n = nrow(dat)-1

dat = dat[2:nrow(dat),] # data (log counts), without years

14) Concatenate and print (from MARSS package)

cat("Model 1 AIC:",kem1$AIC,"\n") #show the AIC

15) Show estimated model coefficients

coef(kem1)

16) Number of columns and rows in array

library(MARSS)

dat = t(harborSealWA)

n = ncol(dat)

m = nrow(dat)

17) Plotting in R (ggplot2). Good tutorial here and here

# use ggplot2

library(ggplot2)

18) Rename columns

Q_GDP_expenditure_approach <- read.csv("~/Box Sync/machine_learning_workup/forecasting/Code/Q_GDP_expenditure_approach.csv", na.strings="..")

names(Q_GDP_expenditure_approach)[1] <- "Country"

19) Plot numeric data

plot(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Australia",]))

# Labels, titles, etc

plot.forecast(best_arima_forecast, xlab="Years", ylab="Port throughput (AUD, imports)", pch=18, col="blue",main = "Best fit ARIMA model")

20) Time series object

GDP_Australia <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Australia",2:ncol(Q_GDP_expenditure_approach)]),

start=c(1990, 1), frequency=4)

21) Plot multiple time series (use ts and ts.plot)

GDP_Australia <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country ==

"Australia",2:ncol(Q_GDP_expenditure_approach)]), start=c(1990, 1), frequency=4)

GDP_Germany <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Germany",2:ncol(Q_GDP_expenditure_approach)]), start=c(1990, 1), frequency=4)

ts.plot(GDP_Australia, GDP_Germany,

gpars=list(xlab="year", ylab="GDP", lty=c(1:2)))

title("GDP of Australia and Germany (deseasoned)")

22) Function in R

# function to standardize time series according to MARSS guide

# Standardize (z-score) a single time series: subtract its mean and divide by
# its standard deviation, both computed with missing values ignored.
#
# timeseries_vector: numeric vector or univariate ts object. NA entries are
#                    skipped when estimating the mean and standard deviation
#                    and remain NA in the result.
# returns: an object of the same length as the input (ts attributes are kept),
#          centred on 0 with unit standard deviation over the non-NA entries.
standardize_timeseries <- function(timeseries_vector) {

  series_mean <- mean(timeseries_vector, na.rm = TRUE)

  series_sd <- sd(timeseries_vector, na.rm = TRUE)

  # A constant series (sd == 0) or fewer than two observed values (sd is NA)
  # cannot be scaled; the division below then yields NaN/Inf/NA, so say so.
  if (is.na(series_sd) || series_sd == 0) {
    warning("standardize_timeseries: standard deviation is zero or undefined; ",
            "result will contain NaN/Inf/NA", call. = FALSE)
  }

  (timeseries_vector - series_mean) / series_sd

}

# call function

# Skip column 1 (Country) so that only the quarterly values are converted to numeric,
# matching the 2:ncol(...) selection used when building the ts objects
TEMP <- as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Germany", 2:ncol(Q_GDP_expenditure_approach)])

stdized_GDP_Germany <- standardize_timeseries(TEMP)

23) R code to perform SMA (standardised major axis regression) to test for allometric power-law (on bitbucket)

24) Load function or import function or import R script

source('~/Box Sync/machine_learning_workup/forecasting/Code/MARSS_example_1.R')

25) Concatenate file names and strings

data_path = "~/Documents/imports-exports-forecasting/socioeconomic_data_munging/"

Q_GDP_expenditure_approach <- read.csv(paste(data_path,"Q_GDP_expenditure_approach.csv",sep=""), na.strings="..")

26) Create an R package

package.skeleton()

27) Timeseries analysis using R (link)

28) Timeseries analysis using R (plotting, using scan function and plot.ts)

rain <- scan("http://robjhyndman.com/tsdldata/hurst/precip1.dat",skip=1)

rainseries <- ts(rain,start=c(1813))

plot.ts(rainseries)

29) if statement in R

# Nested if / else. Keep "else" on the same line as the closing brace so the
# statement also parses when typed at the top level of the console.
if (b_constant_trend == TRUE) {

  if (b_seasonality == FALSE) {

    holtwinters_fit_model <- HoltWinters(timeseries_data_handle, beta = param_beta, gamma = param_gamma)

    # beta = param_beta, gamma = param_gamma should all be FALSE

  } else {

    cat("Error: cannot have constant trend and also seasonality")

    b_error = 1

    # NOTE: return() is only valid inside a function body
    return (b_error)

  }

  # ALSO: an if without an else branch (the condition in parentheses is mandatory)

  if (some_condition) {

    # something

  }

} else {

  load(file="imports_quarterly_timeseries_cleaned.RData")

}

# AND if else else if in R (from link)

if (x ==1){

print('same')

} else if (x > 1){

print('bigger')

} else {

print('smaller')

}

30) View command in R (view data frame in a window)

# dat is a data frame

View(dat)

31) Concatenate two or more vectors to create a matrix (also works for time series objects)

combined_stdized_GDP <- cbind(stdized_GDP_Australia, stdized_GDP_China, stdized_GDP_US,

stdized_GDP_Japan, stdized_GDP_Singapore, stdized_GDP_Germany)

32) Save and load data files RData (similar to mat files in MATLAB)

save(imports_quarterly_raw, file="imports_quarterly_raw.RData")

load(file="imports_quarterly_raw.RData")

# save all workspace objects

save.image(file = "all_objects.RData")

33) Load required package

# require(<package_name>)

require(forecast)

34) Return multiple arguments in R function (from stackoverflow)

foo <- 12

bar <- c("a", "b", "e")

newList <- list("integer" = foo, "names" = bar)

# pack all objects to be returned in a list

ret_list <- list("fit_model"=holtwinters_fit_model, "forecast_ts"=forecast_timeseries)

# access using $

35) Remove all workspace variables (similar to clear all in MATLAB)

rm(list = ls())

36) List all objects in current workspace

ls()

37) Time series object refer to one column

target_ts[,"Tot_Value_FOB_AUD"]

38) Shiny R for visualisation (link) (tutorial)

install.packages("shiny")

library(shiny)

runExample("01_hello")

runApp("my_app")

shiny::runApp()

# Another example from the documentation

# Only run these examples in interactive R sessions

if (interactive()) {

  # A basic shiny app with a plotOutput

  shinyApp(

    # ui: sidebar with a button, main panel with the plot
    ui = fluidPage(

      sidebarLayout(

        sidebarPanel(

          actionButton("newplot", "New plot")

        ),

        mainPanel(

          plotOutput("plot")

        )

      )

    ),

    server = function(input, output) {

      output$plot <- renderPlot({

        # reading the button value makes the plot re-render on every click
        input$newplot

        # Add a little noise to the cars data

        cars2 <- cars + rnorm(nrow(cars))

        plot(cars2)

      })

    }

  )

}

39) R cheat sheets for data viz, data wrangling by R studio

40) Time series objects

library(timeSeries)

ts() # creates time series object

length(<time_series_object>)

start(<time_series_object>) # 1995

end(<time_series_object>) # 2014

window(<time_series_object>, start, end) # new time series object with new start and end

frequency(<time_series_object>)

ts.union # union of two different time series with different frequency and start and end dates

ts.intersect # intersection of two different time series with different frequency and start and end dates

interpNA(<time_series_object>, method="linear") # interpolate NA in time series with imputed values

<ts_object_1> - <ts_object_2> # can subtract two time series objects

removeNA(forecast_var_model$res) # remove NA in time series

train_start = start(target_ts)

train_end = c(2007,4) # 2007, 4th Quarter

test_end = end(target_ts)

# split into train and test

target_ts_train = window(target_ts, start=train_start, end=train_end)

41) Create a matrix

matrix()

42) String comparison in R

# TRUE if any element of fit_arima_model contains the text "error" (case-sensitive)
# NOTE: to detect a failed try(), inherits(fit_arima_model, 'try-error') is the more reliable check
any(grepl("error", fit_arima_model))

# grepl for grep that returns logical TRUE or FALSE

str_temp <- 'caribb'

# grepl takes a regular expression, not a shell glob: a pattern already matches
# anywhere in the string, so no surrounding * is needed; | separates alternatives
grepl(pattern = 'bla|caribb', x = str_temp)

regular expressions in grep (link)

# if try-error then TRUE else FALSE

# MORE ADVANCED

# use stringdist package

require(stringdist)

?amatch

43) try in R

# try() returns the fitted model, or an object of class "try-error" if arima() fails.
# silent = TRUE is an argument of try() (it suppresses the error message), not of arima().
fit_arima_model <- try(

  arima(timeseries_data_handle, order = c(p, d, q)),

  silent = TRUE

)

# also like try catch (from stackoverflow)

retval_try =

try( ( <statement> ),

silent = TRUE

)

if ( inherits(retval_try, 'try-error') )

{

}

44) Socio-economic data from Quandl (link)

install.packages('Quandl')

library(Quandl)

data <- Quandl("FRED/GDP")

head(data)

45) Find minimum element of vector

which(x == min(x))

which(x == min(x, na.rm = TRUE)) # if remove NaN

temp_min_index = which(x == min(x, na.rm = TRUE))

x[temp_min_index]

Find index of matches using which

idx_to_remove <- which(d$PC2 > 0.55)[1]

46) Remove elements from vector (stack exchange)

# First approach

a <- sample (1 : 10)

remove <- c(1,2,3,5,17)

a %in% remove

temp_ind = a %in% remove

a[!temp_ind]

# Second approach

setdiff(a, remove)

47) For loop iterating through a sequence of values ( seq command) (stackoverflow)

for (i_temp_counter in seq(1,num_variables)) {

}

48) Time series in R (link)

49) Function to find number of times series has to be differenced (link)

ndiffs

and inverse of differencing (discrete integration) (link)

diffinv

50) Refer to column

mat[, 1:5]

# OR

mat[, "col_name"]

51) Plot multiple things on the same plot (stackexchange)

plot( x, y1, type="l", col="red" )

par(new=TRUE)

plot( x, y2, type="l", col="green" )

# OR

plot.ts(target_ts_test, xlab ="Years", ylab = "USA GDP",

main = "Normal scale forecast using VAR model",col = "blue", t="p", pch=19) # plot with dots etc

# OR

plot( x, y1, type="l", col="red" )# plot the actual test time series:

lines(target_ts_test, col='red')

# OR

par(mfrow=c(1,2)) # multi panel plot multi-panel figure

52) Get residuals of model fit

res_model <- residuals(model)

53) Coefficients of model fit

coef(var_model)

54) Histogram

hist(x[,"interp_extr_ts"], 20)

rug(x[,"interp_extr_ts"]) # show the data also

abline(v = 12, col = "magenta", lwd = 4) # a line through the hist

55) Linear model (specify model relationship in R)

lm(y ~ x)

# read file

colitis_score <- read.csv('mouse_colitis_score_extracted.csv')

# fields of data frame

col_score = colitis_score$score

metagene_score <- c( mean( c( sum(df_il23$d0_R1), sum(df_il23$d0_R2), sum(df_il23$d0_R3), sum(df_il23$d0_R4) ) ),

mean( c( sum(df_il23$d1_R1), sum(df_il23$d1_R2), sum(df_il23$d1_R3), sum(df_il23$d1_R4) ) ),

mean( c( sum(df_il23$d3_R1), sum(df_il23$d3_R2), sum(df_il23$d3_R3), sum(df_il23$d3_R4) ) ),

mean( c( sum(df_il23$d6_R1), sum(df_il23$d6_R2), sum(df_il23$d6_R3), sum(df_il23$d6_R4) ) ),

mean( c( sum(df_il23$d14_R1), sum(df_il23$d14_R2), sum(df_il23$d14_R3), sum(df_il23$d14_R4) ) ),

mean( c( sum(df_il23$d21_R1), sum(df_il23$d21_R2), sum(df_il23$d21_R3), sum(df_il23$d21_R4) ) )

)

# create data frame

df = data.frame(col_score,metagene_score)

library(ggplot2)

# linear model

s = lm(formula = 'col_score ~ metagene_score', df)

# summary statistics

summary(s)

# plot

plot(metagene_score, col_score, pch = 16, cex = 1.3, col = "blue",

main = "Metagene score vs. colitis score",

xlab = "Metagene score", ylab = "Colitis score")

# line

abline(lm(formula = 'col_score ~ metagene_score', df))

56) Check version of R

version

57) Academic citation for package

citation("shiny")

58) Online book on using R for forecasting (link)

59) Find out type of an R object

class(imports_quarterly_raw)

60) SQL commands in R (link)

library("sqldf")

# find top exports by commodity

output_df = sqldf("select sum(exports_quarterly_raw.Weight) as sum_weight, exports_quarterly_raw.code2, lookup_code.description from exports_quarterly_raw inner join lookup_code on exports_quarterly_raw.code2 = lookup_code.tariff_code group by exports_quarterly_raw.code2 order by sum_weight desc")

61) which command in R to find indices that match in array or data frame

# %in% works for one or many codes; the trailing comma selects matching ROWS
# (without it a data frame would be indexed by column)
lookup_code[which(lookup_code$tariff_code %in% unique_codes), ]

62) Code to load data from tables, perform SQL queries and plot histograms

library("sqldf")

library(ggplot2)

# total import weight of plastics (Code2 = 39) arriving by sea in Boston, one row per year/quarter
df = sqldf("select sum(imports_quarterly_raw.Weight) as sum_weight from imports_quarterly_raw where imports_quarterly_raw.Code2 = 39 and imports_quarterly_raw.Port_of_Discharge in ('Boston') and imports_quarterly_raw.mode_of_transport = 'SEA' group by imports_quarterly_raw.Year, imports_quarterly_raw.Qtr")

df_ts = ts(df, start=c(start_year, start_qtr), frequency=4)

plot.ts(df_ts, main = "Imports: Plastics And Articles Thereof",col = "blue")

# histogram/distribution of plastics (x must be a column of df, not df itself)

qplot(sum_weight, data = df, geom = 'histogram')

63) Save plots in R

pdf(file = "imports_plastics_yearly.pdf")

plot.ts(df_ts, main = "Imports: Plastics And Articles Thereof (aggregated yearly)",col = "blue")

dev.off()

# eps format

# paper expects a character string; argument names are spelled out in full
postscript(file="heatmap.eps",horizontal=TRUE,

onefile=FALSE,width=8.5,

height=11,paper="letter")

dev.off()

# for ggplots

ggsave(filename = 'temp.pdf', useDingbats=FALSE)

ggsave(filename = "boxplot_numpeptides_vs_age.eps", gp, device = "eps")#, useDingbats=FALSE)

64) Names of columns

names()

colnames()

rownames()

65) Other useful commands

unique(), sort(), order()

66) Apply function to data along axis

apply(), lapply(), tapply()

67) Run UNIX command from R

system("gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=finished.pdf *.pdf")

68) concatenate strings and return in R

paste()

69) Get first few characters of a string in R

substr(df_desc[1,], 1, 8)

70) Assign names of columns in data frame

colnames(output_df) <- c("Year", "Qtr", "Country_origin")

71) Split based on character (strsplit)

# Say start_date is of format 2015-03-30 and you want to extract year

as.character(start_date)

strsplit(as.character(start_date), "-")

unlist(strsplit(as.character(start_date), "-"))

72) Convert to character

as.character()

73) unlist

74) Histogram in R

r_all = hist(output_df$sum_fob_amt_aud, main="Histogram for FOB value",

xlab="FOB Value (AUD)",

border="black",

col="blue", breaks = 100) # xbin = c(1,6)

# access all counts, frequencies and midpoints of bins

r_all$counts

r_all$mids

75) Convert weird data format into time series

# Say date is in format 17/2/09

as.Date(exports_customs_short$intended_export_date, "%d/%m/%y")

# converts to R Date format

# Now can feed into SQL (sqldf) format

df = sqldf("select sum(fob_amt_aud) as sum_fob_amt_aud, intended_export_date from exports_customs_short where transport_mode_type_code = 'S' and loading_port_code = 'AUSYD' and cargo_id_type_code = 'C' group by intended_export_date order by intended_export_date")

# this will sort by date

76) Using Jupyter with R (link)

Install miniconda

conda install -c r ipython-notebook r-irkernel

ipython notebook

77) More group by dates in R SQLDF (METHOD: to coerce some field to be date or numeric etc in sqldf)

# Recast datetime columns as R Date object (makes SQL queries easier)

imports_customs_short$search_arrival_date <- (as.Date(imports_customs_short$search_arrival_date, "%d/%m/%y"))

df= sqldf("select sum(gross_weight) as sum_gross_weight, search_arrival_date from imports_customs_short group by search_arrival_date order by search_arrival_date ")

78) Dataframes can have elements with different types but ts (time series) object can have only one type: Date/time

79) Create data frame

data.frame()

OR

final_list_peptides_combined = data.frame(peptide = "peptide", stringsAsFactors=FALSE)

# NOTE: stringsAsFactors=FALSE will prevent columns being used as factors (stackoverflow)

# In R versions before 4.0.0, character columns become factors (categorical variables) by default; from R 4.0.0 the default is stringsAsFactors=FALSE

# Examples

# create data frame

df_bic_values = as.data.frame( cbind(as.numeric(mcl.model_1$bic), as.numeric(mcl.model$bic), as.numeric(mcl.model_3$bic)), stringsAsFactors=FALSE )

# rename columns

colnames(df_bic_values) <- c("BIC 1 normal distribution", "BIC 2 normal distributions", "BIC 3 normal distributions")

80) expand.grid()

81) Hash table in R

list(name = var)

82) Generating random numbers

sample(1:10, 100, replace=TRUE) # 100 samples (with replacement) from the integers 1 to 10

83) Create empty data frame (from link)

data.frame() # data frame with 0 rows and 0 columns; see also link

84) Append to existing data frame (use rbind)

temp_df = round(temp_random_norm * df$sum_container_count[iCount] * i_percentage_export)

# append to data frame

demand_export_df = rbind(demand_export_df, t(temp_df))

85) Generate range of numbers with step 1

20:80

86) Write data frame to csv file

write.csv( demand_export_df, file = "demand_export_shortcustoms_60days.csv" )

write.csv( demand_export_df, file = "demand_export_shortcustoms_60days.csv" , row.names = FALSE, quote=FALSE) # if no row names and no quote around characters

87) For loop

# temp_code is an array

# loop over the indices; seq_along() yields nothing for an empty temp_code,
# whereas 1:length(temp_code) would iterate over c(1, 0)
for (icol in seq_along(temp_code)) {

  # values are not auto-printed inside a loop, so print explicitly
  print(icol)

}

# loop over the elements themselves
for (temp in temp_code) {

  print(temp)

}

88) Select rows from a data frame when a column matches some element in a list (like a SQL select * from where in ....) (link)

# Use %in%

# Select total sum of FOB value for all HS codes that match with selected UN code i.e. sum over all subcommodities

df_imports_sea_sydney_trim = df_imports_sea_sydney[ df_imports_sea_sydney$Code2 %in% list_hs_codes, ]

89) Stacked plots using ggplot2 (from stackoverflow)

# Create dummy data

set.seed(11)

df <- data.frame(a = rlnorm(30), b = 1:10, c = rep(LETTERS[1:3], each = 10))

library(ggplot2)

# geom_area function to fill in area

ggplot(df, aes(x = b, y = a, fill = c)) + geom_area(position = 'stack') # + ylab(str_ylabel) + xlab(str_xlabel) + ggtitle(str_title)

ggsave(filename = 'temp.pdf' , useDingbats=FALSE)

ggsave(filename = "boxplot_numpeptides_vs_age.eps", gp, device = "eps")#, useDingbats=FALSE)

90) List all variables in the workspace

ls()

91) Barplots in R

# X is some matrix with columns being separate bars in the barplot

X <- as.matrix(cbind(df_short_cont_trim_summed$totalsum_fob_amt_aud, df_short_noncont_trim_summed$totalsum_fob_amt_aud))

barplot(X,

names.arg = c("Containerized","Bulk"), ylab = "")

title(main = "Bar chart of Containerized vs. Bulk", font.main = 4)

92) Data munging (dplyr package)

dplyr

93) Concatenate vectors in R

vector_1 = c(1,2)

vector_2 = c(1,2)

c(vector_1, vector_2)

94) R code to test for allometric power law relationship (on bitbucket)

95) R code to perform forecasting and SQL like queries for a road accident forecasting and data exploration project (on bitbucket) (deployed on shinyapps)

Deployed web application to perform data exploration using SQL-like queries and perform machine learning analysis (on shinyapps)

96) R function for generating stacked plots using ggplot (on bitbucket)

97) R code to test for allometric power law relationship (on bitbucket)

98) feather - fast on disk format for data frames in R and python

99) Iterate over lists using apply, mapply (link1) (link2 with examples)

100) Call a function on each element of multiple lists

mapply

# can also be done on multiple cores using library('parallel')

# install.packages('parallel')

list_un_commodity_code = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21)

list_datasource = c(1)

list_piechart = c(1)

list_fob_weight = c(1)

# generate_save_all_plots is a function that takes 4 parameters (scalars)

mapply(generate_save_all_plots, list_un_commodity_code, list_datasource, list_piechart, list_fob_weight)

101) Datetime library (link)

library(lubridate)

year(as_datetime("2015-03-03"))

month(as_datetime("2015-03-03"))

lubridate::year(lubridate::today())

lubridate::make_date( year = lubridate::year(df_epic_labtests_withage$date_of_birth_approx),

month = lubridate::month(df_epic_labtests_withage$date_of_birth_approx),

day = lubridate::day(df_epic_labtests_withage$date_of_birth_approx)

)

102) Fit a gamma distribution to data

library(MASS)

x.gam <- rgamma(100, shape = 1, scale = 1/10)

fitdistr(x.gam, "gamma")

103) String replacement

# gsub, grep, grepl, regexpr

# replace commas, which cause problem with SQL sum()

gsub(",", "", df_temp$total_cval)

# replace every character after _ALL with none (using .*) (courtesy Maria Jose Gomez)

gsub('_ALL.*', '', temp_file)

104) Create list in R

foo <- 12

bar <- c("a", "b", "e")

newList <- list("integer" = foo, "names" = bar)

105) Function to check NaN, NA, Inf, - Inf (link)

is.finite()

106) assert() function for testing

install.packages('testit')

library(testit)

assert("one equals one", 1 == 1)

assert("one equals one", 1 == 2)

# check/assert if numeric using is.finite()

assert("check is numeric \n", is.finite(variable) )

# generic function for assert checking

assert_generic_function <- function(var_input, i_ignore)

{

  ###############################################################################

  # Testing function: asserts (via testit::assert) that var_input is finite

  ###############################################################################

  # var_input: data frame etc

  # i_ignore: if TRUE then ignore and if FALSE then assert

  # Assert statement

  if (i_ignore == FALSE)

  {

    assert("check is numeric \n", is.finite(var_input) )

  }

}

107) List of very handy packages in R (list)

has lubridate, sqldf, plyr, stringr, qcc (for quality control)

108) Package that is good replacement for ggplot

ggraptR (link)

109) R package for rapid data mining and data exploration (like Weka) (courtesy Yuriy Tyshetskiy)

rattle

110) Dynamic reports in R (like ipython notebook)

knitr (link)

making multi-chapter reports with references in knitr (R markdown) (link)

R markdown notebooks and shiny R (link)

chunk options

with

```{r }

```

include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks.

echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures.

message = FALSE prevents messages that are generated by code from appearing in the finished file.

warning = FALSE prevents warnings that are generated by code from appearing in the finished file.

fig.cap = "..." adds a caption to graphical results.

# Insert value of variable dynamically in report

`r toString(nreplicates)`

# table of values using kable

df_bic_values = as.data.frame( cbind(as.numeric(mcl.model_1$bic), as.numeric(mcl.model$bic), as.numeric(mcl.model_3$bic)), stringsAsFactors=FALSE )

colnames(df_bic_values) <- c("BIC 1 normal distribution", "BIC 2 normal distributions", "BIC 3 normal distributions")

knitr::kable( df_bic_values, caption = "Model selection." )

# Multi-panel plots in knitr (link)

```{r fig_sicseq_deseq2_metacluster2_3, fig.cap="Identification of genes", echo=FALSE}

library(png)

library(grid)

library(gridExtra)

img1 <- rasterGrob(as.raster(readPNG("figures/deseq2_inflam_cluster_scseq_metacluster2.png")), interpolate = FALSE)

img2 <- rasterGrob(as.raster(readPNG("figures/deseq2_cluster_scseq_metacluster3.png")), interpolate = FALSE)

grid.arrange(img1, img2, ncol = 2)

```

# run knitr and rmarkdown from command line

R

library(knitr)

library(rmarkdown)

knitr::knit('~/periphery_project/combined_analysis_v4.rmd')

link

rmarkdown::render("test.Rmd", "html_document")

Rscript -e 'library(rmarkdown); rmarkdown::render("/path/to/test.Rmd", "html_document")'

# A typical header of a R markdown file will look like

---

title: "Analysis and Writeup"

header-includes:

- \usepackage{placeins}

- \usepackage{float}

- \floatplacement{figure}{H}

output:

pdf_document:

fig_caption: yes

keep_tex: yes

latex_engine: xelatex

number_sections: yes

word_document: default

html_document:

df_print: paged

bibliography: Periphery_project.bib

urlcolor: blue

---

```{r include=FALSE}

knitr::opts_chunk$set(echo = TRUE)

knitr::opts_chunk$set(cache = TRUE)

knitr::opts_chunk$set(warning = FALSE)

knitr::opts_chunk$set(out.extra = '')

#knitr::opts_chunk$set(fig.pos = 'H')

```

\begin{centering}

\vspace{3 cm}

\Large

\normalsize

Soumya Banerjee, `r format(Sys.time(), "%b %d %Y")`

\vspace{3 cm}

\end{centering}

\setcounter{tocdepth}{2}

\tableofcontents

\newpage

```{r,include=FALSE}

library(knitr)

library(gridExtra)

library(rmarkdown)

# EQUATIONS in rmarkdown

$$ eGFR = eGFR_{0} + b_{before}*t_{before} $$

```

Italics in rmarkdown using *metafor*

Code can be rendered or shown in rmarkdown using

```

dsBaseClient::ds.summary(x='surv_object')

```

111) Bayesian structural time series in R

CausalImpact

112) C++ coding best practices and tips for using with R on Mac OS X (link)

113) Reading a csv file

file_strong_binders = read.csv('strong_binders.txt', sep = ' ', header = FALSE, stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)

OR

df_monocyte_genes_IGP <- read.csv('IGP_SuppTable_MOD.csv',

sep = ',', header = TRUE,

stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)

# quote="" # for dealing with ' etc in files

NOTE: stringsAsFactors=FALSE will prevent columns being used as factors (stackoverflow)

NOTE: quote="" # for dealing with ' etc in files

114) Find number of rows in a dataframe

nrow()

115) Debugging in R

function like matlab keyboard command is browser()

see link1 and link2

116) Loop through all files in directory (link)

# pattern is a regular expression (not a shell glob): match names ending in ".csv"
files <- list.files(path=str_results_directory, pattern="\\.csv$", recursive=FALSE)

117) Adding elements to a list in a loop (stackoverflow)

list_val = list()

i_temp_counter = 1

for (temp_lambda in c(0.01, 0.1, 0.15, 0.2, 0.3, 0.5, 0.6))

{

list_val[[i_temp_counter]] = temp_lambda

i_temp_counter = i_temp_counter + 1

}

# get all elements in a nice vector

c(as.numeric (list_val) )

118) Sample from an empirical distribution (non-parametric distribution)

sample(empirical_distr, N_samples, replace=TRUE)

119) Concatenate columns of a dataframe and create one column with concatenated data

# Uses with() and paste0()

copy_numbers_AIREWT_file$V6 = with(copy_numbers_AIREWT_file, paste0(copy_numbers_AIREWT_file$V1,copy_numbers_AIREWT_file$V2,copy_numbers_AIREWT_file$V3,copy_numbers_AIREWT_file$V4,copy_numbers_AIREWT_file$V5))

120) Replace non-zero elements of a matrix (stackoverflow)

# find all non-zero elements

i_index_notzero = which(all_genes_withzeros_LOG10 != 0, arr.ind = TRUE)

# replace non-zero elements with log10

all_genes_withzeros_LOG10[i_index_notzero] = log10(all_genes_withzeros_LOG10[i_index_notzero])

OR if needs to be replaced with a single value

all_genes_withzeros_LOG10[all_genes_withzeros_LOG10==0] = NA

OR get index of column and remove that column

idx_to_remove <- which(d$PC2 > 0.55)[1]

IMPORTANT: how to remove column (courtesy Maria Jose Gomez)

# remove that column with [,c(-254)] notation

df_file_str_filename_RNASeq_withpath_NORM_PCAREMOVE = df_file_str_filename_RNASeq_withpath_NORM[,c(-idx_to_remove)]

# use grep to column position and then remove it

i_col_to_remove = grep("Comment_IBD_in_family", colnames(df_filename_scseq_infl_cluster))

df_filename_scseq_infl_cluster = df_filename_scseq_infl_cluster[,c(-i_col_to_remove)]

IMPORTANT: how to remove column (courtesy Stephen Sansom)

rownames(all_gene_matched_withprobeid_osm_agg) <- all_gene_matched_withprobeid_osm_agg$ENSEMBL

all_gene_matched_withprobeid_osm_agg$ENSEMBL <- NULL

121) Turn off scientific notation in R (no 1e-3 only 0.003)

options(scipen = 100)

122) append to a file and add a data frame

# write.table with append=TRUE

# will not work with write.csv (it ignores append)

# write the header only if the file does not exist yet, so it is not repeated on every append

write.table(df_final_metadata, file=filename_metadata_FINAL,

row.names = FALSE, quote=FALSE, append = TRUE, sep = ",",

col.names = !file.exists(filename_metadata_FINAL))

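123) continue statement in R

next # inside a for/while loop: skip to the next iteration (R's equivalent of continue)

124) Create a directory

dir.create("./data")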

125) Download files from an URL and save it in a directory (courtesy Elsa Arcaute)

data_url <- "XX"

dir.create("./data")

dest_file="data/temp.zip"

downloader::download(data_url,destfile=dest_file, mode = "wb")

126) Unzip file

unzip(dest_file,exdir="data")

127) Report generation using knitr with caching of results

128) Updating R, R Studio and updating packages (link)

# run from R console

install.packages('devtools') #assuming it is not already installed

library(devtools)

install_github('andreacirilloac/updateR')

library(updateR)

updateR(admin_password = 'Admin user password')

# then update RStudio if required

# update.packages() from R console

129) Heatmaps in R (using the ComplexHeatmap package)

library(ComplexHeatmap)

library(circlize)

set.seed(123)

# mat is generic matrix

mat = cbind(rbind(matrix(rnorm(16, -1), 4), matrix(rnorm(32, 1), 8)),

rbind(matrix(rnorm(24, 1), 4), matrix(rnorm(48, -1), 8)))

# permute the rows and columns

mat = mat[sample(nrow(mat), nrow(mat)), sample(ncol(mat), ncol(mat))]

rownames(mat) = paste0("R", 1:12)

colnames(mat) = paste0("C", 1:10)

Heatmap(mat)

# Also heatmaps using the heatmap.2 package (link)

# Also machine learning chapter in Irizarry ( book)

# Excerpt here

library(RColorBrewer)

hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)

Now, pick the genes with the top variance over all samples:

library(genefilter)

rv <- rowVars(e)

idx <- order(-rv)[1:40]

While a heatmap function is included in R, we recommend the heatmap.2 function from the gplots

package on CRAN because it is a bit more customized. For example, it stretches to fill the window.

Here we add colors to indicate the tissue on the top:

library(gplots) ##Available from CRAN

cols <- palette(brewer.pal(8, "Dark2"))[as.fumeric(tissue)]

head(cbind(colnames(e),cols))

heatmap.2(e[idx,], labCol=tissue,

trace="none",

ColSideColors=cols,

col=hmcol)

# save heatmap.2

png(filename = "heatmap.png")

heatmap.2(m)

dev.off()

130) Get column and row sums

rowSums()

colSums()

131) ggplot with labels

# TRICK CONCEPT: add a label to the dataframe with condition (stim)

library(ggplot2)

# ggplot magic to plot with labels

# pass the whole data frame to ggplot() and refer to its columns by bare name inside aes()
gp <- ggplot(d, aes(x = PC1, y = PC2, color = stim)) + geom_point(size=5)

print(gp)

132) How to append a string to the end of each entry of a column (using paste0)

file_all_target_mappings_orig$SAMPLE <- paste0(file_all_target_mappings_orig$SAMPLE,".CEL")

133) rownames() and colnames() to get names of rows and columns

134) METHOD: add a column in the dataframe for stimulation condition

d$stim = sqldf("select file_all_target_mappings_orig_LPS_LPSaIL10.STIMULATION as stim

from file_all_target_mappings_orig_LPS_LPSaIL10

inner join d

on file_all_target_mappings_orig_LPS_LPSaIL10.SAMPLE = d.sample")

135) sorting and ordering an array on a column (using order() )

order(x[,1]) # sort on first column

idx = order(x[,1]) # get indices of sorted array

x[idx,] # feed these indices into array to get sorted

# all together (succinct)

x[order(x[,1]), ]

136) Summary statistics of an object

summary(as.vector(mat))

137) Filtering out rows of an array

idx2 <- apply(rawdata, 1, max) > 10 # only take in rows that have max > 10

rawdata <- rawdata[idx2, ] # use this to index on the array

138) table() and is.na() to summarize data

# how many are NA ?

is.na( select(x=hta20transcriptcluster.db, keys=c(rownames(all_genes_pca_loading_sorted)), columns = c("GENENAME") ) )

table( is.na( select(x=hta20transcriptcluster.db, keys=c(rownames(all_genes_pca_loading_sorted)), columns = c("GENENAME") ) ) )

139) METHOD: turn row names into a field

# add a column for sample name

# d is a dataframe

d$sample = rownames(d)

140) Data wrangling using dplyr (cheatsheet)

141) R markdown notebooks and shiny R (link)

chunk options

with

```{r }

```

include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks.

echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures.

message = FALSE prevents messages that are generated by code from appearing in the finished file.

warning = FALSE prevents warnings that are generated by code from appearing in the finished file.

fig.cap = "..." adds a caption to graphical results.