Very basic R
Install R on Ubuntu (link)
sudo apt-get install r-base
other dependencies
From R:
install.packages('devtools')
If there are errors, sudo apt-get install <name debian package>
and try
Install RStudio on Ubuntu (link)
sudo apt-get install gdebi-core
wget https://download2.rstudio.org/server/xenial/amd64/rstudio-server-1.3.1093-amd64.deb sudo gdebi rstudio-server-1.3.1093-amd64.deb
1) R code for basic data munging
# code in R for basic data munging
# inspired by Jeff Leek lecture
# basics of data munging in R
# https://www.youtube.com/watch?v=8DGBOh6hJaE
prismData <- read.csv('combined_hlty_prism_shotgun.csv')
names(prismData)
# names have dot, now split based on dot
splitNames = strsplit(names(prismData),'\\.')
splitNames[[5]]
splitNames[[5]][1]
# apply (map) this function
# lambda x -> x[1] applied to splitNames
# function(x){x[1]}
firstElement <- function(x){x[1]}
sapply(splitNames,firstElement)
head(prismData,3)
# substitute names
tempData <- sapply(splitNames,firstElement)
gsub("_","",tempData)
sub("_","",tempData)
# example to show merge
riskData <- read.csv('combined_hlty_prism_shotgun_MOD.csv')
names(riskData)
head(riskData,3)
merge(prismData,riskData,by.x='cohort',by.y='cohort',all=TRUE)
# only those that match
merge(prismData,riskData,by.x='cohort',by.y='cohort')
# sort values
prismData$cohort
sort(prismData$cohort)
sort(prismData$cohort)[1:10]
# sort data frame
prismData$cohort[1:10]
sort(prismData$cohort[1:10])
2) Install and load packages
install.packages('smatr')
library('smatr')
# Check if package installed, if not then install it (from link)
if(!require("zoo"))
{install.packages("zoo")}
# Install locally
library("highr", lib.loc="~/R/x86_64-pc-linux-gnu-library/3.2")
3) Creating data frames
log10crime <- read.csv('log10crime.csv')
log10comm_population <- read.csv('log10comm_population.csv')
# assign to data frame
df = data.frame(log10crime,log10comm_population)
4) Extract first few (head) of data frame
head(df)
> log10crime log10comm
1 2.2355 4.0785
2 2.6828 4.3640
# Note: log10crime and log10commare headers
5) Reference column of data frame by column name
df["log10crime"]
6) Number of rows in data frame
nrow(df)
7) Formula in R (note: sma is a package in SMATR package; load using library(smatr) )
sma(formula='log10crime~log10comm', data=df)
8) Good IDE (RStudio)
9) Help on something
?factor
10) List of numbers
period <- 2003:2012
11) Array index
dat = t(harborSealWA) # transpose
years = dat[1,] # 1st row of dat contains years
12) Transpose (from MARSS package)
dat = t(harborSealWA)
13) Access arrays (from MARSS package)
years = dat[1,] # 1st row of dat contains years
n = nrow(dat)-1
dat = dat[2:nrow(dat),] # data (log counts), without years
14) Concatenate and print (from MARSS package)
cat("Model 1 AIC:",kem1$AIC,"\n") #show the AIC
15) Show estimated model coefficients
coef(kem1)
16) Number of columns and rows in array
library(MARSS)
dat = t(harborSealWA)
n = ncol(dat)
m = nrow(dat)
17) Plotting in R (ggplot2). Good tutorial here and here
# use ggplot2
library(ggplot2)
18) Rename columns
Q_GDP_expenditure_approach <- read.csv("~/Box Sync/machine_learning_workup/forecasting/Code/Q_GDP_expenditure_approach.csv", na.strings="..")
names(Q_GDP_expenditure_approach)[1] <- "Country"
19) Plot numeric data
plot(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Australia",]))
# Labels, titles, etc
plot.forecast(best_arima_forecast, xlab="Years", ylab="Port throughput (AUD, imports)", pch=18, col="blue",main = "Best fit ARIMA model")
20) Time series object
GDP_Australia <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Australia",2:ncol(Q_GDP_expenditure_approach)]),
start=c(1990, 1), frequency=4)
21) Plot multiple time series (use ts and ts.plot)
GDP_Australia <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country ==
"Australia",2:ncol(Q_GDP_expenditure_approach)]), start=c(1990, 1), frequency=4)
GDP_Germany <- ts(as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Germany",2:ncol(Q_GDP_expenditure_approach)]), start=c(1990, 1), frequency=4)
ts.plot(GDP_Australia, GDP_Germany,
gpars=list(xlab="year", ylab="GDP", lty=c(1:2)))
title("GDP of Australia and Germany (deseasoned)")
22) Function in R
# function to standardize time series according to MARSS guide
standardize_timeseries <- function(timeseries_vector)
{
y_bar = apply(t(timeseries_vector), 1, mean, na.rm=TRUE)
Sigma = sqrt(apply(t(timeseries_vector), 1, var, na.rm=TRUE))
stdized_GDP_Australia = (timeseries_vector - y_bar) * (1/Sigma)
return(stdized_GDP_Australia)
}
# call function
TEMP <- as.numeric(Q_GDP_expenditure_approach[Q_GDP_expenditure_approach$Country == "Germany",])
stdized_GDP_Germany = standardize_timeseries(TEMP)
23) R code to perform SMA (semi-major axis regression) to test for allometric power-law (on bitbucket)
24) Load function or import function or import R script
source('~/Box Sync/machine_learning_workup/forecasting/Code/MARSS_example_1.R')
25) Concatenate file names and strings
data_path = "~/Documents/imports-exports-forecasting/socioeconomic_data_munging/"
Q_GDP_expenditure_approach <- read.csv(paste(data_path,"Q_GDP_expenditure_approach.csv",sep=""), na.strings="..")
26) Create an R package
package.skeleton()
27) Timeseries analysis using R (link)
28) Timeseries analysis using R (plotting, using scan function and plot.ts)
rain <- scan("http://robjhyndman.com/tsdldata/hurst/precip1.dat",skip=1)
rainseries <- ts(rain,start=c(1813))
plot.ts(rainseries)
29) if statement in R
if (b_constant_trend == TRUE) {
if (b_seasonality == FALSE) {
holtwinters_fit_model <- HoltWinters(timeseries_data_handle, beta = param_beta, gamma = param_gamma)
# beta = param_beta, gamma = param_gamma should all be FALSE
}
else {
cat("Error: cannot have constant trend and also seasonality")
b_error = 1
return (b_error)
}
# ALSO
if {
# something
}
} else {
load(file="imports_quarterly_timeseries_cleaned.RData")
}
# AND if else else if in R (from link)
if (x ==1){
print('same')
} else if (x > 1){
print('bigger')
} else {
print('smaller')
}
30) View command in R (view data frame in a window)
# dat is a data frame
View(dat)
31) Concatenate two or more vectors to create a matrix (also works for time series objects)
combined_stdized_GDP <- cbind(stdized_GDP_Australia, stdized_GDP_China, stdized_GDP_US,
stdized_GDP_Japan, stdized_GDP_Singapore, stdized_GDP_Germany)
32) Save and load data files RData (similar to mat files in MATLAB)
save(imports_quarterly_raw, file="imports_quarterly_raw.RData")
load(file="imports_quarterly_raw.RData")
# save all workspace objects
save.image(file = "all_objects.RData")
33) Load required package
# require(<package_name>)
require(forecast)
34) Return multiple arguments in R function (from stackoverflow)
foo <- 12
bar <- c("a", "b", "e")
newList <- list("integer" = foo, "names" = bar)
# pack all objects to be returned in a list
ret_list <- list("fit_model"=holtwinters_fit_model, "forecast_ts"=forecast_timeseries)
# access using $
35) Remove all workspace variables (similar to clear all in MATLAB)
rm(list = ls())
36) List all objects in current workspace
ls()
37) Time series object refer to one column
target_ts[,"Tot_Value_FOB_AUD"]
38) Shiny R for visualisation (link) (tutorial)
install.packages("shiny")
library(shiny)
runExample("01_hello")
runApp("my_app")
shiny::runApp()
# Another example from the documentation
# Only run these examples in interactive R sessions
if (interactive()) {
# A basic shiny app with a plotOutput
shinyApp(
ui = fluidPage(
sidebarLayout(
sidebarPanel(
actionButton("newplot", "New plot")
),
mainPanel(
plotOutput("plot")
)
)
),
server = function(input, output) {
output$plot <- renderPlot({
input$newplot
# Add a little noise to the cars data
cars2 <- cars + rnorm(nrow(cars))
plot(cars2)
})
}
)
39) R cheat sheets for data viz, data wrangling by R studio
40) Time series objects
library(timeSeries)
ts() # creates time series object
length(<time_series_object>)
start(<time_series_object>) # 1995
end(<time_series_object>) # 2014
window(<time_series_object>, start, end) # new time series object with new start and end
frequency(<time_series_object>)
ts.union # union of two different time series with different frequency and start and end dates
ts.intersect # intersection of two different time series with different frequency and start and end dates
interpNA(<time_series_object>, method=“linear”) # interpolate NA in time series with imputed values
<ts_object_1> - <ts_object_2> # can subtract two time series objects
removeNA(forecast_var_model$res) # remove NA in time series
train_start = start(target_ts)
train_end = c(2007,4) # 2014, 3rd Quarter
test_end = end(target_ts)
# split into train and test
target_ts_train = window(target_ts, start=train_start, end=train_end)
41) Create a matrix
matrix()
42) String comparison in R
any(grepl("error", (fit_arima_model))
# grepl for grep that returns logical TRUE or FALSE
str_temp = 'caribb'
grepl(pattern = '*bla*|*caribb*', x = str_temp)
regular expressions in grep (link)
# if try-error then TRUE else FALSE
# MORE ADVANCED
# use stringdist package
require(stringdist)
?amatch
43) try in R
fit_arima_model <- try( arima( timeseries_data_handle, order=c(p, d, q),
silent = TRUE )
# also like try catch (from stackoverflow)
retval_try =
try( ( <statement> ),
silent = TRUE
)
if ( inherits(retval_try, 'try-error') )
{
<error handling statement>
}
44) Socio-economic data from Quandl (link)
install.packages('Quandl')
library(Quandl)
data <- Quandl("FRED/GDP")
head(data)
45) Find minimum element of vector
which(x == min(x))
which(x == min(x, na.rm = TRUE)) # if remove NaN
temp_min_index = which(x == min(x, na.rm = TRUE))
x[temp_min_index]
Find index of matches using which
idx_to_remove <- which(d$PC2 > 0.55)[1]
46) Remove elements from vector (stack exchange)
# First approach
a <- sample (1 : 10)
remove <- c(1,2,3,5,17)
a %in% remove
temp_ind = a %in% remove
a[!temp_ind]
# Second approach
setdiff(a, remove)
47) For loop iterating through a sequence of values ( seq command) (stackoverflow)
for (i_temp_counter in seq(1,num_variables)) {
}
48) Time series in R (link)
49) Function to find number of times series has to be differenced (link)
ndiffs
and inverse of differencing (discrete integration) (link)
diffinv
50) Refer to column
mat[, 1:5]
# OR
mat[, "col_name"]
51) Plot multiple things on the same plot (stackexchange)
plot( x, y1, type="l", col="red" )
par(new=TRUE)
plot( x, y2, type="l", col="green" )
# OR
plot.ts(target_ts_test, xlab ="Years", ylab = "USA GDP",
main = "Normal scale forecast using VAR model",col = "blue", t="p", pch=19) # plot with dots etc
# OR
plot( x, y1, type="l", col="red" )# plot the actual test time series:
lines(target_ts_test, col='red')
# OR
par(mfrow=c(1,2)) # multi panel plot multi-panel figure
52) Get residuals of model fit
res_model <- residuals(model)
53) Coefficients of model fit
coef(var_model)
54) Histogram
hist(x[,"interp_extr_ts"], 20)
rug(x[,"interp_extr_ts"]) # show the data also
abline(v = 12, col = "magenta", lwd = 4) # a line through the hist
55) Linear model (specify model relationship in R)
lm(y ~ x)
# read file
colitis_score <- read.csv('mouse_colitis_score_extracted.csv')
# fields of data frame
col_score = colitis_score$score
metagene_score <- c( mean( c( sum(df_il23$d0_R1), sum(df_il23$d0_R2), sum(df_il23$d0_R3), sum(df_il23$d0_R4) ) ),
mean( c( sum(df_il23$d1_R1), sum(df_il23$d1_R2), sum(df_il23$d1_R3), sum(df_il23$d1_R4) ) ),
mean( c( sum(df_il23$d3_R1), sum(df_il23$d3_R2), sum(df_il23$d3_R3), sum(df_il23$d3_R4) ) ),
mean( c( sum(df_il23$d6_R1), sum(df_il23$d6_R2), sum(df_il23$d6_R3), sum(df_il23$d6_R4) ) ),
mean( c( sum(df_il23$d14_R1), sum(df_il23$d14_R2), sum(df_il23$d14_R3), sum(df_il23$d14_R4) ) ),
mean( c( sum(df_il23$d21_R1), sum(df_il23$d21_R2), sum(df_il23$d21_R3), sum(df_il23$d21_R4) ) )
)
# create data frame
df = data.frame(col_score,metagene_score)
library(ggplot2)
# linear model
s = lm(formula = 'col_score ~ metagene_score', df)
# summary statistics
summary(s)
# plot
plot(metagene_score, col_score, pch = 16, cex = 1.3, col = "blue",
main = "Metagene score vs. colitis score",
xlab = "Metagene score", ylab = "Colitis score")
# line
abline(lm(formula = 'col_score ~ metagene_score', df))
56) Check version of R
version
57) Academic citation for package
citation("shiny")
58) Online book on using R for forecasting (link)
59) Find out type of an R object
class(imports_quarterly_raw)
60) SQL commands in R (link)
library("sqldf")
# find top exports by commodity
output_df = sqldf("select sum(exports_quarterly_raw.Weight) as sum_weight, exports_quarterly_raw.code2, lookup_code.description from exports_quarterly_raw inner join lookup_code on exports_quarterly_raw.code2 = lookup_code.tariff_code group by exports_quarterly_raw.code2 order by sum_weight desc")
61) which command in R to find indices that match in array or data frame
lookup_code[which(lookup_code$tariff_code == unique_codes)]
62) Code to load data from tables, perform SQL queries and plot histograms
library("sqldf")
library(ggplot2)
df = sqldf("select sum(imports_quarterly_raw.Weight) as sum_weight from imports_quarterly_raw where imports_quarterly_raw.Code2 = 39 and imports_quarterly_raw.Port_of_Discharge in ('Boston) and imports_quarterly_raw.mode_of_transport = 'SEA' group by imports_quarterly_raw.Year, imports_quarterly_raw.Qtr")
df_ts = ts(df, start=c(start_year, start_qtr), frequency=4)
plot.ts(df_ts, main = "Imports: Plastics And Articles Thereof",col = "blue")
# histogram/distribution of plastics
qplot(df, data = df, geom = 'histogram')
63) Save plots in R
pdf(file = "imports_plastics_yearly.pdf")
plot.ts(df_ts, main = "Imports: Plastics And Articles Thereof (aggregated yearly)",col = "blue")
dev.off()
# eps format
postscript(file="heatmap.eps",horiz=TRUE,
onefile=FALSE,width=8.5,
height=11,paper=letter)
dev.off()
# for ggplots
ggsave(filename = 'temp.pdf', useDingbats=FALSE)
ggsave(filename = "boxplot_numpeptides_vs_age.eps", gp, device = "eps")#, useDingbats=FALSE)
64) Names of columns
names()
colnames()
rownames()
65) Other useful commands
unique(), sort(), order()
66) Apply function to to data along axis
apply(), lapply(), tapply()
67) Run UNIX command from R
system("gs -dBATCH -dNOPAUSE -q -sDEVICE=pdfwrite -sOutputFile=finished.pdf *.pdf")
68) concatenate strings and return in R
paste()
69) Get first few characters of a string in R
substr(df_desc[1,], 1, 8)
70) Assign names of columns in data frame
colnames(output_df) <- c("Year", "Qtr", "Country_origin")
71) Split based on character (strsplit)
# Say start_date is of format 2015-03-30 and you want to extract year
as.character(start_date)
strsplit(as.character(start_date), "-")
unlist(strsplit(as.character(start_date), "-"))
72) Convert to character
as.character()
73) unlist
74) Histogram in R
r_all = hist(output_df$sum_fob_amt_aud, main="Histogram for FOB value",
xlab="FOB Value (AUD)",
border="black",
col="blue", breaks = 100) # xbin = c(1,6)
# access all counts, frequencies and midpoints of bins
r_all$counts
r_all$mids
75) Convert weird data format into time series
# Say date is in format 17/2/09
as.Date(exports_customs_short$intended_export_date, "%d/%m/%y")
# converts to R Date format
# Now can feed into SQL (sqldf) format
df = sqldf("select sum(fob_amt_aud) as sum_fob_amt_aud, intended_export_date from exports_customs_short where transport_mode_type_code = 'S' and loading_port_code = 'AUSYD' and cargo_id_type_code = 'C' group by intended_export_date order by intended_export_date")
# this will sort by date
76) Using Jupyter with R (link)
Install miniconda
conda install -c r ipython-notebook r-irkernel
ipython notebook
77) More group by dates in R SQLDF (METHOD: to coerce some field to be date or numeric etc in sqldf)
# Recast datetime columns as R Date object (makes SQL queries easier)
imports_customs_short$search_arrival_date <- (as.Date(imports_customs_short$search_arrival_date, "%d/%m/%y"))
df= sqldf("select sum(gross_weight) as sum_gross_weight, search_arrival_date from imports_customs_short group by search_arrival_date order by search_arrival_date ")
78) Dataframes can have elements with different types but ts (time series) object can have only one type: Date/time
79) Create data frame
data.frame()
OR
final_list_peptides_combined = data.frame(peptide = "peptide", stringsAsFactors=FALSE)
# NOTE: stringsAsFactors=FALSE will prevent columns being used as factors (stackoverflow)
# Characters by default become factors or categorical variables
# Examples
# create data frame
df_bic_values = as.data.frame( cbind(as.numeric(mcl.model_1$bic), as.numeric(mcl.model$bic), as.numeric(mcl.model_3$bic)), stringsAsFactors=FALSE )
# rename columns
colnames(df_bic_values) <- c("BIC 1 normal distribution", "BIC 2 normal distributions", "BIC 3 normal distributions")
80) expand.grid()
81) Hash table in R
list(name = var)
82) Generating random numbers
sample(1:10, 100, replace=TRUE) # 10 samples between 1 and 10
83) Create empty data frame (from link)
date.frame() # see also link
84) Append to existing data frame (use rbind)
temp_df = round(temp_random_norm * df$sum_container_count[iCount] * i_percentage_export)
# append to data frame
demand_export_df = rbind(demand_export_df, t(temp_df))
85) Generate range of numbers with step 1
20:80
86) Write data frame to csv file
write.csv( demand_export_df, file = "demand_export_shortcustoms_60days.csv" )
write.csv( demand_export_df, file = "demand_export_shortcustoms_60days.csv" , row.names = FALSE, quote=FALSE) # if no row names and no quote around characters
87) For loop
# temp_code is an array
for (icol in 1:length(temp_code))
{
icol
}
for (temp in temp_code)
{
temp
}
88) Select rows from a data frame when a column matches some element in a list (like a SQL select * from where in ....) (link)
# Use %in%
# Select total sum of FOB value for all HS codes that match with selected UN code i.e. sum over all subcommodities
df_imports_sea_sydney_trim = df_imports_sea_sydney[ df_imports_sea_sydney$Code2 %in% list_hs_codes, ]
89) Stacked plots using ggplot2 (from stackoverflow)
# Create dummy data
set.seed(11)
df <- data.frame(a = rlnorm(30), b = 1:10, c = rep(LETTERS[1:3], each = 10))
library(ggplot2)
# geom_area function to fill in area
ggplot(df, aes(x = b, y = a, fill = c)) + geom_area(position = 'stack') # + ylab(str_ylabel) + xlab(str_xlabel) + ggtitle(str_title)
ggsave(filename = 'temp.pdf' , useDingbats=FALSE)
ggsave(filename = "boxplot_numpeptides_vs_age.eps", gp, device = "eps")#, useDingbats=FALSE)
90) List all variables in the workspace
ls()
91) Barplots in R
# X is some matrix with columns being separate bars in the barplot
X <- as.matrix(cbind(df_short_cont_trim_summed$totalsum_fob_amt_aud, df_short_noncont_trim_summed$totalsum_fob_amt_aud))
barplot(X,
names.arg = c("Containerized","Bulk"), ylab = "")
title(main = "Bar chart of Containerized vs. Bulk", font.main = 4)
92) Data munging (dplyr package)
dplyr
93) Concatenate vectors in R
vector_1 = c(1,2)
vector_2 = c(1,2)
c(vector_1, vector_2)
94) R code to test for allometric power law relationship (on bitbucket)
95) R code to perform forecasting and SQL like queries for a road accident forecasting and data exploration project (on bitbucket) (deployed on shinyapps)
Deployed web application to perform data exploration using SQL-like queries and perform machine learning analysis (on shinyapps)
96) R function for generating stacked plots using ggplot (on bitbucket)
97) R code to test for allometric power law relationship (on bitbucket)
98) feather - fast on disk format for data frames in R and python
99) Iterate over lists using apply, mapply (link1) (link2 with examples)
100) Call a function on each element of multiple lists
mapply
# can also be done on multiple cores using library('parallel')
# install.packages('parallel')
list_un_commodity_code = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 20, 21)
list_datasource = c(1)
list_piechart = c(1)
list_fob_weight = c(1)
# generate_save_all_plots is a function that takes 4 parameters (scalars)
mapply(generate_save_all_plots, list_un_commodity_code, list_datasource, list_piechart, list_fob_weight)
101) Datetime library (link)
library(lubridate)
year(as_datetime("2015-03-03"))
month(as_datetime("2015-03-03"))
lubridate::year(lubridate::today())
lubridate::make_date( year = lubridate::year(df_epic_labtests_withage$date_of_birth_approx),
month = lubridate::month(df_epic_labtests_withage$date_of_birth_approx),
day = lubridate::day(df_epic_labtests_withage$date_of_birth_approx)
)
102) Fit a gamma distribution to data
library(MASS)
x.gam <- rgamma(100, shape = 1, scale = 1/10)
fitdistr(x.gam, "gamma")
103) String replacement
# gsub, grep, grepl, regexpr
# replace commas, which cause problem with SQL sum()
gsub(",", "", df_temp$total_cval)
# replace every character after _ALL with none (using .*) (courtesy Maria Jose Gomez)
gsub('_ALL.*', '', temp_file)
104) Create list in R
foo <- 12
bar <- c("a", "b", "e")
newList <- list("integer" = foo, "names" = bar)
105) Function to check NaN, NA, Inf, - Inf (link)
is.finite()
106) assert() function for testing
install.packages('testit')
library(testit)
assert("one equals one", 1 == 1)
assert("one equals one", 1 == 2)
# check/assert if numeric using is.finite()
assert("check is numeric \n", is.finite(variable) )
# generic function for asset checking
assert_generic_function <- function(var_input, i_ignore)
{
###############################################################################
# Testing function
###############################################################################
# var_input: data frame etc
# i_ignore: if TRUE then ignore and if FALSE then assert
# Assert statement
if (i_ignore == FALSE)
{
assert("check is numeric \n", is.finite(var_input) )
}
}
107) List of very handy packages in R (list)
has lubridate, sqldf, plyr, stringr, qcc (for quality control)
108) Package that is good replacement for ggplot
ggraptR (link)
109) R package for rapid data mining and data exploration (like Weka) (courtesy Yuriy Tyshetskiy)
rattle
110) Dynamic reports in R (like ipython notebook)
knitr (link)
making multi-chapter reports with references in knitr (R markdown) (link)
R markdown notebooks and shiny R (link)
chunk options
with
```r { }
```
include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks.
echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures.
message = FALSE prevents messages that are generated by code from appearing in the finished file.
warning = FALSE prevents warnings that are generated by code from appearing in the finished.
fig.cap = "..." adds a caption to graphical results.
# Insert value of variable dynamically in report
`r toString(nreplicates)`
# table of values using kable
df_bic_values = as.data.frame( cbind(as.numeric(mcl.model_1$bic), as.numeric(mcl.model$bic), as.numeric(mcl.model_3$bic)), stringsAsFactors=FALSE )
colnames(df_bic_values) <- c("BIC 1 normal distribution", "BIC 2 normal distributions", "BIC 3 normal distributions")
knitr::kable( df_bic_values, caption = "Model selection." )
# Multi-panel plots in knitr (link)
```{r fig_sicseq_deseq2_metacluster2_3, fig.cap="Identification of genes", echo=FALSE}
library(png)
library(grid)
library(gridExtra)
img1 <- rasterGrob(as.raster(readPNG("figures/deseq2_inflam_cluster_scseq_metacluster2.png")), interpolate = FALSE)
img2 <- rasterGrob(as.raster(readPNG("figures/deseq2_cluster_scseq_metacluster3.png")), interpolate = FALSE)
grid.arrange(img1, img2, ncol = 2)
```
# run knitr and rmarkdown from command line
R
library(knitr)
library(rmarkdown)
knitr::knit('~/periphery_project/combined_analysis_v4.rmd')
rmarkdown::render("test.Rmd", "html_document")
Rscript -e 'library(rmarkdown); rmarkdown::render("/path/to/test.Rmd", "html_document")'
# A typical header of a R markdown file will look like
---
title: "Analysis and Writeup"
header-includes:
- \usepackage{placeins}
- \usepackage{float}
- \floatplacement{figure}{H}
output:
pdf_document:
fig_caption: yes
keep_tex: yes
latex_engine: xelatex
number_sections: yes
word_document: default
html_document:
df_print: paged
bibliography: Periphery_project.bib
urlcolor: blue
---
```{r include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(cache = TRUE)
knitr::opts_chunk$set(warning = FALSE)
knitr::opts_chunk$set(out.extra = '')
#knitr::opts_chunk$set(fig.pos = 'H')
```
\begin{centering}
\vspace{3 cm}
\Large
\normalsize
Soumya Banerjee, `r format(Sys.time(), "%b %d %Y")`
\vspace{3 cm}
\end{centering}
\setcounter{tocdepth}{2}
\tableofcontents
\newpage
```{r,include=FALSE}
library(knitr)
library(gridExtra)
library(rmarkdown)
# EQUATIONS in rmarkdown
$$ eGFR = eGFR_{0} + b_{before}*t_{before} $$
```
Italics in rmarkdown using *metafor*
Code can be rendered or shown in rmarkdown using
```
dsBaseClient::ds.summary(x='surv_object')
```
111) Bayesian structural time series in R
CausalImpact
112) C++ coding best practices and tips for using with R on Mac OS X (link)
113) Reading a csv file
file_strong_binders = read.csv('strong_binders.txt', sep = ' ', header = FALSE, stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)
OR
df_monocyte_genes_IGP <- read.csv('IGP_SuppTable_MOD.csv',
sep = ',', header = TRUE,
stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)
# quote="" # for dealing with ' etc in files
NOTE: stringsAsFactors=FALSE will prevent columns being used as factors (stackoverflow)
NOTE: quote="" # for dealing with ' etc in files
114) Find number of rows in a dataframe
nrow()
115) Debugging in R
function like matlab keyboard command is browser()
116) Loop through all files in directory (link)
files <- list.files(path=str_results_directory, pattern="*.csv", recursive=FALSE)
117) Adding elements to a list in a while loop (stackoverflow)
list_val = list()
i_temp_counter = 1
for (temp_lambda in c(0.01, 0.1, 0.15, 0.2, 0.3, 0.5, 0.6))
{
list_val[[i_temp_counter]] = temp_lambda
i_temp_counter = i_temp_counter + 1
}
# get all elements in a nice vector
c(as.numeric (list_val) )
118) Sample from an empirical distribution (non-parametric distribution)
sample(empirical_distr, N_samples, replace=TRUE)
119) Concatenate columns of a dataframe and create one column with concatenated data
# Uses with() and paste0()
copy_numbers_AIREWT_file$V6 = with(copy_numbers_AIREWT_file, paste0(copy_numbers_AIREWT_file$V1,copy_numbers_AIREWT_file$V2,copy_numbers_AIREWT_file$V3,copy_numbers_AIREWT_file$V4,copy_numbers_AIREWT_file$V5))
120) Replace non-zero elements of a matrix (stackoverflow)
# find all non-zero elements
i_index_notzero = which(all_genes_withzeros_LOG10 != 0, arr.ind = TRUE)
# replace non-zero elements with log10
all_genes_withzeros_LOG10[i_index_notzero] = log10(all_genes_withzeros_LOG10[i_index_notzero])
OR if needs to be replaced with a single value
all_genes_withzeros_LOG10[all_genes_withzeros_LOG10==0] = NA
OR get index of column and remove that column
idx_to_remove <- which(d$PC2 > 0.55)[1]
IMPORTANT: how to remove column (courtesy Maria Jose Gomez)
# remove that column with [,c(-254)] notation
df_file_str_filename_RNASeq_withpath_NORM_PCAREMOVE = df_file_str_filename_RNASeq_withpath_NORM[,c(-idx_to_remove)]
# use grep to column position and then remove it
i_col_to_remove = grep("Comment_IBD_in_family", colnames(df_filename_scseq_infl_cluster))
df_filename_scseq_infl_cluster = df_filename_scseq_infl_cluster[,c(-i_col_to_remove)]
IMPORTANT: how to remove column (courtesy Stephen Sansom)
rownames(all_gene_matched_withprobeid_osm_agg) <- all_gene_matched_withprobeid_osm_agg$ENSEMBL
all_gene_matched_withprobeid_osm_agg$ENSEMBL <- NULL
121) Turn off scientific notation in R (no 1e-3 only 0.003)
options(scipen = 100)
122) append to a file and add a data frame
# write.table with append=TRUE
# will not work with write.csv
write.table(df_final_metadata, file=filename_metadata_FINAL,
row.names = FALSE, quote=FALSE, append = FALSE, sep = ",") #, col.names = NA)
123) continue statement in R
next
124) create directory in R
dir.create("./data")
125) Download files from an URL and save it in a directory (courtesy Elsa Arcaute)
data_url <- “XX”
dir.create("./data")
dest_file="data/temp.zip"
downloader::download(data_url,destfile=dest_file, mode = "wb")
126) Unzip file
unzip(dest_file,exdir="data")
127) Report generation using knitr with caching of results
128) Updating R, R Studio and updating packages (link)
# run from R console
install.packages('devtools') #assuming it is not already installed
library(devtools)
install_github('andreacirilloac/updateR')
library(updateR)
updateR(admin_password = 'Admin user password')
# the update R Studio if required
# update.packages() from R console
129) Heatmaps in R (using the ComplexHeatmap package)
library(ComplexHeatmap)
library(circlize)
set.seed(123)
# mat is generic matrix
mat = cbind(rbind(matrix(rnorm(16, -1), 4), matrix(rnorm(32, 1), 8)),
rbind(matrix(rnorm(24, 1), 4), matrix(rnorm(48, -1), 8)))
# permute the rows and columns
mat = mat[sample(nrow(mat), nrow(mat)), sample(ncol(mat), ncol(mat))]
rownames(mat) = paste0("R", 1:12)
colnames(mat) = paste0("C", 1:10)
Heatmap(mat)
# Also heatmaps using the heatmap.2 package (link)
# Also machine learning chapter in Irizarry ( book)
# Excerpt here
library(RColorBrewer)
hmcol <- colorRampPalette(brewer.pal(9, "GnBu"))(100)
Now, pick the genes with the top variance over all samples:
library(genefilter)
rv <- rowVars(e)
idx <- order(-rv)[1:40]
While a heatmap function is included in R, we recommend the heatmap.2 function from the gplots
package on CRAN because it is a bit more customized. For example, it stretches to fill the window.
Here we add colors to indicate the tissue on the top:
library(gplots) ##Available from CRAN
cols <- palette(brewer.pal(8, "Dark2"))[as.fumeric(tissue)]
head(cbind(colnames(e),cols))
heatmap.2(e[idx,], labCol=tissue,
trace="none",
ColSideColors=cols,
col=hmcol)
# save heatmap.2
png(filename = "heatmap.png")
heatmap.2(m)
dev.off()
130) Get column and row sums
rowSums()
colSums()
131) ggplot with labels
# TRICK CONCEPT: add a label to the dataframe with condition (stim)
library(ggplot2)
# ggplot magic to plot with labels
gp <- ggplot(d$stim, aes(d$PC1, d$PC2, color=stim)) + geom_point(size=5)
print(gp)
132) How to append a string to the end of each entry of a column (using paste0)
file_all_target_mappings_orig$SAMPLE <- paste0(file_all_target_mappings_orig$SAMPLE,".CEL")
133) rownames() and colnames() to get names of rows and columns
134) METHOD: add a column in the dataframe for stimulation condition
d$stim = sqldf("select file_all_target_mappings_orig_LPS_LPSaIL10.STIMULATION as stim
from file_all_target_mappings_orig_LPS_LPSaIL10
inner join d
on file_all_target_mappings_orig_LPS_LPSaIL10.SAMPLE = d.sample")
135) sorting and ordering an array on a column (using order() )
order(x[,1]) # sort on first column
idx = order(x[,1]) # get indices of sorted array
x[idx,] # feed these indices into array to get sorted
# all together (succinct)
x[order(x[,1]), ]
136) Summary statistics of an object
summary(as.vector(mat))
137) Filtering out rows of an array
idx2 <- apply(rawdata, 1, max) > 10 # only take in rows that have max > 10
rawdata <- rawdata[idx2, ] # use this to index on the array
138) table() and is.na() to summarize data
# how many are NA ?
is.na( select(x=hta20transcriptcluster.db, keys=c(rownames(all_genes_pca_loading_sorted)), columns = c("GENENAME") ) )
table( is.na( select(x=hta20transcriptcluster.db, keys=c(rownames(all_genes_pca_loading_sorted)), columns = c("GENENAME") ) ) )
139) METHOD: turn column names into a field
# add a column for sample name
# d is a dataframe
d$sample = rownames(d)
140) Data wrangling using dplyr (cheatsheet)
141) R markdown notebooks and shiny R (link)
chunk options
with
```r { }
```
include = FALSE prevents code and results from appearing in the finished file. R Markdown still runs the code in the chunk, and the results can be used by other chunks.
echo = FALSE prevents code, but not the results from appearing in the finished file. This is a useful way to embed figures.
message = FALSE prevents messages that are generated by code from appearing in the finished file.
warning = FALSE prevents warnings that are generated by code from appearing in the finished.
fig.cap = "..." adds a caption to graphical results.
MORE LINKS (link)
142) NICE COMMANDS
subset(), with()
143) qplot() in ggplot2
library(ggplot2)
library(datasets)
data(mpg)
qplot(displ, hwy, data=mpg)
144) Chi-squared test with contigency table (from link) (also see link)
M <- as.table(rbind(c(762, 327, 468), c(484, 239, 477)))
dimnames(M) <- list(gender = c("F", "M"),
party = c("Democrat","Independent", "Republican"))
View(M)
(Xsq <- chisq.test(M)) # Prints test summary
Xsq$observed # observed counts (same as M)
Xsq$expected # expected counts under the null
Xsq$residuals # Pearson residuals
Xsq$stdres # standardized residuals
Fisher's exact test (link)
fisher.test(M)
145) Convert data frame column (severe) to a factor with meaningful labels and then plot in ggplot TRICK
df_metagene_risk_score_final_joined_atNF_combined$severe = factor(df_metagene_risk_score_final_joined_atNF_combined$severe, c(0,1), c("Severe", "Not Severe"))
ggplot(df_metagene_risk_score_final_joined_atNF_combined, aes(x=metagene_score, colour = severe)) + geom_density()
146) ggplot multi-panel figure (courtesy Maria Jose Gomez Vazquez)
library(gridExtra)
gp1 <- qplot(metagene_score, data = df_metagene_risk_score_final_joined_atNF_combined, geom = 'histogram')
gp2 <- qplot(metagene_score, data = df_metagene_risk_score_final_joined_atNF_combined, geom = 'density')
grid.arrange(gp1, gp2, nrow=1)
147) R max DLL reached problem
.Renviron file in home directory
R_MAX_NUM_DLLS=250
148) ggplot theme
theme_set(theme_gray())
149) select annotations for microarrays
# http://bioconductor.org/packages/3.6/data/annotation/manuals/illuminaMousev2.db/man/illuminaMousev2.db.pdf
source("https://bioconductor.org/biocLite.R")
BiocInstaller::biocLite("illuminaMousev2.db")
library(illuminaMousev2.db)
select(x=illuminaMousev2.db, c('ILMN_1217075'), c("ENSEMBL","SYMBOL"))
# find all columns using
AnnotationDbi::colnames(hta20transcriptcluster())
150) transpose a data frame
#transpose data matrix
matrix_all_genes_risk_scseq_array_weighted = (t(as.matrix(all_genes_risk_scseq_array_weighted[,1:321])))
# make new data frame
df_matrix_all_genes_risk_scseq_array_weighted = as.data.frame(matrix_all_genes_risk_scseq_array_weighted,
stringsAsFactors = FALSE)
# get new column names and rownames
colnames(df_matrix_all_genes_risk_scseq_array_weighted) <- all_genes_risk_scseq_array_weighted$gene_name
df_matrix_all_genes_risk_scseq_array_weighted$id = rownames(matrix_all_genes_risk_scseq_array_weighted)
151) Get indices of columns that have only a particular pattern (courtesy Maria Jose Gomez)
# get indices of columns that have only BAMC id
temp_idx_columns_with_risk = grep("BAMC", colnames(all_genes_risk_scseq_array_weighted))
152) Wilcoxon test (link)
boxplot(Ozone ~ Month, data = airquality)
wilcox.test(Ozone ~ Month, data = airquality,
subset = Month %in% c(5, 8))
153) Use of %in% command to subset a data frame
# all IDs that are in another data frame
df_matrix_all_genes_risk_scseq_array_weighted_CONTROL =
df_matrix_all_genes_risk_scseq_array_weighted[df_matrix_all_genes_risk_scseq_array_weighted$id %in% df_risk_healthy$SampleID,]
# logical operation
# all IDs if in another data frame, set a new column to the logical result (TRUE/FALSE)
df_matrix_all_genes_risk_scseq_array_weighted$control = df_matrix_all_genes_risk_scseq_array_weighted$id %in% df_matrix_all_genes_risk_scseq_array_weighted_CONTROL$id
154) ggplot boxplot (link)
gp <- ggplot(df_num_metagene_score_NEW, aes(x=control,y=num_metagene_score_NEW, fill=control))
gp <- gp + geom_boxplot()
gp
155) reshape, cast and melt (link)
156) ggplot facet_wrap facet wrap (link)
gp <- ggplot(df_combined_score_data_frame, aes(x=control,y=norm_metagene,fill=control))
gp <- gp + geom_boxplot()
gp <- gp + facet_wrap( ~ signature, ncol=2, scales = "free_y")
gp <- gp + xlab("IBD vs. Control")
gp <- gp + ylab("Normalised metagene")
gp
# where control, norm_metagene and signature are columns of a data frame named df_combined_score_data_frame
157) Demonstration of how to load a data frame, melt it and plot it in ggplot using facet_wrap (courtesy Kevin Rue-Albrecht and Maria Jose Gomez)
t =
cbind(
sum(df_il23$d0_R1),
sum(df_il23$d0_R2),
sum(df_il23$d0_R3),
sum(df_il23$d0_R4)
)
t = rbind(t, cbind(
sum(df_il23$d1_R1),
sum(df_il23$d1_R2),
sum(df_il23$d1_R3),
sum(df_il23$d1_R4)
)
)
t = rbind(t, cbind(
sum(df_il23$d3_R1),
sum(df_il23$d3_R2),
sum(df_il23$d3_R3),
sum(df_il23$d3_R4)
)
)
t = rbind(t, cbind(
sum(df_il23$d6_R1),
sum(df_il23$d6_R2),
sum(df_il23$d6_R3),
sum(df_il23$d6_R4)
)
)
t = rbind(t, cbind(
sum(df_il23$d14_R1),
sum(df_il23$d14_R2),
sum(df_il23$d14_R3),
sum(df_il23$d14_R4)
)
)
t = rbind(t, cbind(
sum(df_il23$d21_R1),
sum(df_il23$d21_R2),
sum(df_il23$d21_R3),
sum(df_il23$d21_R4)
)
)
df_tt = data.frame(t, stringsAsFactors = FALSE)
df_tt$day = c("0","1","3","6","14","21")
df_tt$source = c("metacluster5","metacluster5","metacluster5","metacluster5","metacluster5","metacluster5")
df_tt_melted = melt(df_tt, id=c("day","source"))
# convert to numeric
df_tt_melted$day = as.numeric(df_tt_melted$day)
# make factors
df_tt_melted$source = factor(df_tt_melted$source)
df_tt_melted$day = factor(df_tt_melted$day)
gp <- ggplot(df_tt_melted, aes(x=day,y=value,fill=source))
gp <- gp + geom_point()
gp <- gp + xlab("Days") + ylab("Metagene score")
gp
theme_set(theme_gray())
gp <- ggplot(df_tt_melted, aes(x=day,y=value,color=source)) # fill=source
gp <- gp + geom_point()
gp <- gp + facet_wrap(~ source, ncol=2, scales = "free_y")
gp <- gp + xlab("Days") + ylab("Metagene score")
gp
158) Make a list unique OR rownames unique (courtesy Maria Jose Gomez)
mat_mod$gene_name <- make.unique(mat_mod$gene_name)
159) Assigning a Boolean value of an expression to a column (using %in%)
df_matrix_all_genes_risk_scseq_array_weighted$control = df_matrix_all_genes_risk_scseq_array_weighted$ccfaid %in% df_matrix_all_genes_risk_scseq_array_weighted_CONTROL$ccfaid
160) Other useful functions
merge, unique, subset, intersect, Reduce
intersect with multiple sets (link)
Reduce( intersect, list(a,b,c) )
161) ggplot point with line group by source (link) (courtesy Stephen Sansom)
theme_set(theme_gray())
gp <- ggplot(df_combined_mouse_metagene_withsource, aes(x=day,y=value,group=source) )
gp <- gp + geom_point(aes(color=source))
gp <- gp + geom_line(aes(color=source))
gp
162) Heatmap with ggplot (link)
geom_tile()
163) zscore
scale()
164) remove a row due to a row having all zeros or all NAs (link) (stackoverflow)
# use complete.cases()
zscored_values <- zscored_values[(complete.cases(zscored_values)),]
# remove a row due to a column having NA or Inf
# relies on the fact that Inf * 0 is NA and NA * 0 is NA (link)
complete.cases(c(1, Inf))
[1] TRUE TRUE
complete.cases(c(1, Inf*0))
[1] TRUE FALSE
complete.cases(c(1, Inf*0, NA))
[1] TRUE FALSE FALSE
complete.cases(c(1, Inf*0, NA*0))
[1] TRUE FALSE FALSE
# final code to remove a row due to a column having NA or Inf
dt_tests_with_age <- dt_tests_with_age[complete.cases(dt_tests_with_age$egfr_recalc * 0), ]
# ALTERNATIVE remove NA or remove a row with NA
na.omit(d2)
165) cut() command to subset a dataframe
166) Rename one column of a data frame
df_combined_score_data_frame[(df_combined_score_data_frame$signature == "metacluster5"),]$signature = "ribosomal"
167) Select those rows that have a particular characteristic and subset an array based on that
#keep those IDs that have medicine usage in active disease
# select IDs
temp_idx_to_keep = which( colnames(m_to_be_plotted_withbar_subset_active) %in% df_subset_subset$id )
# subset array
m_to_be_plotted_withbar_subset_active_SUBSET = m_to_be_plotted_withbar_subset_active[,temp_idx_to_keep]
168) ggplot histogram (courtesy Maria Jose Gomez Vazquez) (other examples of ggplot histogram from R for Data Science)
library(gridExtra)
gp1 <- qplot(metagene_score, data = df_metagene_risk_score_final_joined_atNF_combined, geom = 'histogram')
gp2 <- qplot(metagene_score, data = df_metagene_risk_score_final_joined_atNF_combined, geom = 'density')
grid.arrange(gp1, gp2, nrow=1)
OR (courtesy Shanquan Chen)
NOTE: use of position="identity". CAUTION: default is position="stacked" (link)
gp <- ggplot(df_temp_combined_ckd_grouped, aes(x=min_age_calculated_asof_test/365, fill=flag_responder_binary) )
gp <- gp + geom_histogram(bins=15, position = "identity", alpha=0.4) # stat = "count"
gp <- gp + xlab('Age of subject at time of Lithium cessation')
gp <- gp + ylab('Frequency')
gp
OR
# facet wrap and facet grid histogram
# where data frame df_late_inflamed_fc1_new_distribution_metagene_melted has score in field value and column named histology
theme_set(theme_gray())
gp2 <- ggplot(df_late_inflamed_fc1_new_distribution_metagene_melted, aes(x=value,fill=histology))
gp2 <- gp2 + geom_histogram()#(alpha=0.5)
gp2 <- gp2 + xlab("Metagene score") + ylab("Number of patients")
#gp <- gp + facet_wrap(histology~.)
gp2 <- gp2 + facet_grid(histology~.)
gp2
# facet wrap and facet grid
# geom_density with alpha makes it transparent
theme_set(theme_gray())
gp <- ggplot(df_late_inflamed_fc1_new_distribution_metagene_melted, aes(x=value,fill=histology))
gp <- gp + geom_density(alpha=0.5)
#gp <- gp + facet_wrap(histology~.)
gp <- gp + facet_grid(histology~.)
gp
print(gp)
# if you want to change the order in which variables are facetted TRICK (courtesy Kevin-Rue Albrecht)
df_combined_metagene_with_hist_melted_refactored = melt(df_combined_metagene_with_hist,
id=c("id","histology"))
df_combined_metagene_with_hist_melted_refactored$histology = factor(df_combined_metagene_with_hist_melted_refactored$histology,
c("macro", "micro", "control")
)
theme_set(theme_gray())
gp2 <- ggplot(df_combined_metagene_with_hist_melted_refactored, aes(x=value,fill=histology))
gp2 <- gp2 + geom_histogram()#(alpha=0.5)
gp2 <- gp2 + xlab("Metagene score") + ylab("Number of patients")
#gp2 <- gp2 + facet_wrap(histology~.)
gp2 <- gp2 + facet_grid(histology~.)
gp2
# if you want to relabel factors (e.g. if 1 = "General", 2 = "Academic")
df_counts_data$prog <- factor(df_counts_data$prog,
levels = c(1,2,3),
labels = c("General", "Academic", "Vocational")
)
169) Named list in R (like key value dict in Python)
mouse_late_early = c(Path1 = "#7570B3", Path2 = "#E7298A", Path3 = "#66A61E")
170) Run R from command line
nohup R --no-save < analysis_degeneracy_ageing.R
171) Colour schemes and palettes for R (link)
172) Get rid of factors in a data frame (courtesy Maria Jose Gomez)
# CONCEPT: factors are stored as numbers. They need to be converted to character first and then numbers
# the field active has become a factor; convert it into numeric
df_data$active = as.numeric(as.character(df_data$active))
173) remove rows in a matrix or dataframe with duplicated gene names (courtesy Kevin Rue-Albrecht)
# using duplicated function
all_genes = all_genes[!duplicated(all_genes$gene_name), ]
174) Excellent ggplot visualization website (link)
Another excellent ggplot visualization link (link)
175) dplyr ordering (courtesy Maria Jose Gomez)
y_comb <- y_comb %>% group_by(source) %>% arrange(source, desc(V1)) %>% as.data.frame()
176) geom_path ggplot with facet wrap
# use dplyr to sort data frame helps with geom_path
y_comb <- y_comb %>% group_by(source) %>% arrange(source, desc(V1)) %>% as.data.frame()
#############################
# Plot facet
#############################
gp <- ggplot(y_comb, aes(x=y_comb$V1, y=y_comb$V2))
gp <- gp + geom_path()
gp <- gp + facet_wrap(~source, ncol=2)
gp <- gp + geom_hline(data=y_comb, aes(yintercept = threshold), linetype="dashed")
gp <- gp + xlim(0,1)
gp <- gp + ylim(0,1)
gp <- gp + xlab("Recall")
gp <- gp + ylab("Precision")
gp
177) Violin plot with jitter and median lines shown
theme_set(theme_classic())
gp2 <- ggplot(df_combined_metagene_with_hist_melted_refactored, aes(y=value))
gp2 <- gp2 + geom_violin(aes(x = disease_category, y=value, fill=disease_category), draw_quantiles = c(0.5))
gp2 <- gp2 + geom_jitter(height = 0, width = 0.1, aes(x=disease_category, y=value))
gp2 <- gp2 + xlab("Disease category") + ylab("Metagene score")
# gp2 <- gp2 + xlim(20,100)
# gp2 <- gp2 + ylim(10,90)
#gp <- gp + facet_wrap(histology~.)
# gp2 <- gp2 + facet_grid(histology~.)
gp2
178) Volcano plot using ggplot
#######################################
# prettier MA plot using ggplot
#######################################
res_df$gene_name <- rownames(res_df)
data = res_df
data$color = "default"
# Invert sign of log fold change
data$log2FoldChange = -1 * data$log2FoldChange
data$log2_adjpvalue = -log2(data$padj)
# Horizontal lines for fold change
i_y_line_threshold_intercept_top = -log2(0.05)
i_y_line_threshold_intercept_bottom = -1
i_x_line_threshold_intercept_1 = log2(2)
i_x_line_threshold_intercept_2 = log2(1/2)
top = head(data, n=20) # by -pvalue
data$color[abs(data$log2FoldChange)> 1 & data$padj<0.05] <- "> 2 fold, padj < 0.05"
cols <- c("default"="darkgrey", "> 2 fold, padj < 0.05"="red")
# theme_set(theme_gray())
theme_set(theme_classic())
# gp <- ggplot(data, aes(baseMean, log2FoldChange, color=color)) + geom_point(alpha=0.8)
gp <- ggplot(data, aes(log2FoldChange, log2_adjpvalue, color=color)) + geom_point(alpha=0.8)
# gp <- gp + scale_x_continuous(trans="log2")
#gp <- gp + geom_text(data=top, aes(label=gene_name), vjust=1,hjust=-0.2, color="black")
gp <- gp + geom_text_repel(data=top, aes(label=gene_name), vjust=1, hjust=-0.2, color="black")
gp <- gp + geom_hline(yintercept=i_y_line_threshold_intercept_top, linetype="dashed")
gp <- gp + geom_vline(xintercept=i_x_line_threshold_intercept_1, linetype="dashed")
gp <- gp + geom_vline(xintercept=i_x_line_threshold_intercept_2, linetype="dashed")
#gp <- gp + geom_hline(yintercept=i_y_line_threshold_intercept_bottom, linetype="dashed")
gp <- gp + scale_color_manual(values=cols)
gp <- gp + ggtitle("DESeq2 analysis of monocytes across all donors within a metacluster")
# gp <- gp + xlab("Fold change (log2)expression (baseMean)") + ylab("Fold change (log2)")
gp <- gp + xlab("Log fold change") + ylab("-log (p-value)")
print(gp)
str_maplot_filename = paste0("volcanoplot_meta", i_meta_cluster_id, ".pdf")
ggsave(filename = str_maplot_filename, gp, device = "pdf", useDingbats=FALSE)
ggsave(filename = "boxplot_numpeptides_vs_age.eps", gp, device = "eps")#, useDingbats=FALSE)
179) Replace ALL missing values in a data frame with NA
idx_missing_values_in_cells = which(df_joined_cleaned_removed == '', arr.ind = TRUE)
df_joined_cleaned_removed[idx_missing_values_in_cells] = 'NA'
Find missing values (link)
which(!complete.cases(df_politeness))
180) grep to select those rows that do not have a pattern (courtesy Maria Jose Gomez)
yourdata[-grep(c('/'),yourdata$DOB),]
OR
yourdata[grep(c('/'),yourdata$DOB, invert=TRUE),]
181) geom_error_bar() tutorial (link1) (link2)
# example code using geom_errorbar() and geom_errorbarh()
theme_set(theme_classic())
theme_set(theme_gray())
i_y_line_threshold_intercept_atnf = 0.3661417
i_y_line_threshold_intercept_active = 0.7118644
gp <- ggplot(df_summary_aupr_signatures_RE, aes(x=aTNF,y=active_disease, color=signature))
#gp <- ggplot(df_summary_aupr_signatures_RE, aes(x=aTNF,y=active_disease, shape=signature, color=signature))
gp <- gp + geom_point(aes(size=number_genes)) # df_summary_aupr_signatures_RE, size=number_genes
gp <- gp + xlab("AUPR aTNF")
gp <- gp + ylab("AUPR active disease")
gp <- gp + geom_vline(xintercept=i_y_line_threshold_intercept_atnf, linetype="dashed")
gp <- gp + geom_hline(yintercept=i_y_line_threshold_intercept_active, linetype="dashed")
gp <- gp + geom_text_repel(data=df_summary_aupr_signatures_RE,
aes(label=signature), vjust=1,hjust=-0.2, min.segment.length = unit(1, "mm"),
color="black") #(data=show, aes(label=gene_name))
gp <- gp + geom_errorbar(data = df_summary_aupr_signatures_RE_2, aes(ymin=active_disease-1.96*sd_active/i_num_samples, ymax=active_disease+1.96*sd_active/i_num_samples), width=0.1 )
gp <- gp + geom_errorbarh(data = df_summary_aupr_signatures_RE_2, aes(xmin=aTNF-1.96*sd_atnf/i_num_samples, xmax=aTNF+1.96*sd_atnf/i_num_samples))#, width=0.1 )
# gp <- gp + xlim(0,1)
# gp <- gp + ylim(0.6,1)
gp <- gp + scale_size(range = c(2,10))
#gp <- gp + scale_x_continuous()
#gp <- gp + scale_y_continuous()
gp
ggsave(filename = "signature_genes.pdf", useDingbats=FALSE)
182) Converting weird date formats (adapted from stackoverflow and r-bloggers)
# date is Aug-08
x <- df_filename_scseq_infl_cluster$Last_colonoscopy[1]
x <- as.Date(paste("01-", x, sep = ""), format = "%d-%b-%y")
x <- format(x, "%d/%m/%Y")
df_filename_scseq_infl_cluster$Last_colonoscopy[1] <- as.character(x)
183) Summarize and aggregate data using the aggregate() command
#collapse expression values for genes with multiple probes
data$probe_id <- NULL
data$gene_symbol <- NULL
data.agg <- aggregate(.~gene_id, data, median)
rownames(data.agg) <- data.agg$gene_id
data.agg$gene_id <- NULL
184) Use of apply() to filter data for QC
# Expression filter
hist(data.matrix)
data.matrix <- data.matrix[apply(data.matrix,1,max)>4,]
print(dim(data.matrix))
# Variance filter
hist(apply(data.matrix,1,var), breaks=30)
data.matrix <- data.matrix[apply(data.matrix,1,var)>0.5,]
print(dim(data.matrix))
185) Barplot in ggplot
theme_set(theme_classic())
#theme_set(theme_gray())
gp <- ggplot(data=df_intercept, aes(x=gene_name, y=coeff))
gp <- gp + geom_bar(stat = "identity")
# gp <- gp + geom_bar()
# gp <- gp + facet_wrap(~gene_name)
gp <- gp + xlab("Gene name")
gp <- gp + ylab("Coefficient")
gp <- gp + coord_flip()
gp
186) parsing command line arguments (optparse)
library(optparse)
# Options ----
option_list <- list(
make_option(c("--seuratobject"), default="begin.Robj",
help="A seurat object after PCA"),
make_option(c("--clusterids"), default="none",
help="A list object containing the cluster identities"),
make_option(c("--outdir"), default="seurat.out.dir",
help="outdir")
)
opt <- parse_args(OptionParser(option_list=option_list))
outPrefix <- file.path(opt$outdir,"markers.summary")
s <- readRDS(opt$seuratobject)
saveRDS(df_combined_all_features, file = "source_data/df_combined_all_features.rds")
df_combined_all_features <- readRDS(file = "source_data/df_combined_all_features.rds")
187) RColorBrewer diverging palettes (link)
188) Negation of %in% operator (from link)
'%!in%' <- function(x,y)!('%in%'(x,y))
which(df_filename_scseq_infl_cluster$misc %!in% c('Y','N','NA'))
189) TRICK: Use as.numeric() will convert non-numeric to NA (link)
190) Check if a file exists
file.exists(str_file_name)
191) ggplot linear regression (link) (link)
theme_set(theme_gray())
#theme_set(theme_classic())
gp <- ggplot(data = df_colscore_metagenescore_noDUPL, aes(x=metagene_score, y=col_score))
gp <- gp + geom_point()
gp <- gp + geom_smooth(method = "lm", level=0.95) #se=TRUE)
# gp <- gp + geom_smooth(span=0.3)
gp
OR
ggplot(data1, aes(x=QUET, y=SBP))+
geom_point()+
geom_smooth(method=lm, se=TRUE)
192) PCA in R
pca <- prcomp(2^emat, scale=T)
#pca_probelevel <- prcomp(2^emat_probelevel, scale=T)
# summary of data
summary(as.vector(emat))
#summary(as.vector(emat_probelevel))
dim(emat)
#dim(emat_probelevel)
# PCA analysis on raw data
#pca = prcomp(log10(geneCore@assayData$exprs + 1))
#pca = prcomp(log10(geneCore@assayData$exprs), scale = TRUE)
#pca = prcomp(geneCore@assayData$exprs)#, scale. = TRUE)
# pca = prcomp(2^geneCore@assayData$exprs, scale = TRUE)
dim(2^geneCore@assayData$exprs)
# get rotation
d <- as.data.frame(pca$rotation)
#d_probelevel <- as.data.frame(pca_probelevel$rotation)
# PCA plot (not pretty)
plot(d$PC1,d$PC2)
#plot(d_probelevel$PC1,d_probelevel$PC2)
# generate scree plot
(summary(pca))$importance["Proportion of Variance",]
head(pca$sdev)
plot( (summary(pca))$importance["Proportion of Variance",],
xlab="component",
ylab="proportion of variance",
main="scree plot")
################################################
# what are top loading genes
################################################
head(pca$x[order(pca$x[,1]),])
head(pca$x[order(pca$x[,1], decreasing = TRUE ),] , n =20 )
#head(pca_probelevel$x[order(pca_probelevel$x[,1]),])
# order(pca$x[,1])
# idx = order(pca$x[,1]) # get indices of sorted array
# pca$x[idx,] # feed these indices into array to get sorted
# head(pca$x[idx,]) # head of that
#head(pca_probelevel$x[order(pca_probelevel$x[,1], decreasing = TRUE ),] , n =20 )
#AnnotationDbi::select(x = hta20transcriptcluster.db, keys = c("TC04001102.hg.1"), columns = c("GENENAME"))
#AnnotationDbi::select(x = hta20transcriptcluster.db, keys = c("TC12001162.hg.1"), columns = columns(x=hta20transcriptcluster.db))
#top_pca_loaded_genes <- head(pca$x[order(pca$x[,1], decreasing = TRUE ),] , n = 100 )
#AnnotationDbi::select(x = hugene10sttranscriptcluster.db, keys = c(rownames(top_pca_loaded_genes)), columns = c("GENENAME"))
# now look at all genes PCA loading sorted
#all_genes_pca_loading_sorted <- pca$x[ order(pca$x[,1], decreasing = TRUE ) ,]
# what are these genes
#AnnotationDbi::select(x=hugene10sttranscriptcluster.db, keys=c(rownames(all_genes_pca_loading_sorted)), columns = c("GENENAME") )
###############################################################################################
# Prettier plots using ggplot
###############################################################################################
# add a column for sample name OR stimulation condition etc (NOTE: d is a dataframe)
#d$sample = rownames(d)
# add a column in the dataframe for stimulation condition
# join by column name with original metadata
#d$stim = sqldf("select file_all_target_mappings_orig_LPS_LPSaIL10.STIMULATION as stim
# from file_all_target_mappings_orig_LPS_LPSaIL10
# inner join d
# on file_all_target_mappings_orig_LPS_LPSaIL10.SAMPLE = d.sample")
# ggplot magic to plot with labels
#gp <- ggplot(d$stim, aes(d$PC1, d$PC2, color=stim)) + geom_point(size=5)
#print(gp)
193) Return multiple arguments from function
function <- test()
{
# code here
return(list(
demographics = demographics,
diagnosis = diagnosis
))
}
# call function
list[demographics, diagnosis] <- test()
194) Charts, different kinds of plots and network diagrams using the networkd3 package in R (link)
195) Data munging like filter using dplyr and magrittr (courtesy Shanquan Chen)
# table command displays summary statistics (like frequency)
# filter filters the data
library(sqldf)
library(magrittr)
library(dplyr)
library(lubridate)
library(stringr)
mdat <- all_patients %$% table( Diagnosis_Code )
mdat <- data.frame(mdat)
mdat %>% filter(Diagnosis_Code == "F20")
sqldf::sqldf("select * from mdat where Diagnosis_Code like 'F20' ")
195) (CONTINUED FROM ABOVE) Pattern matching using stringr (courtesy Shanquan Chen)
mdat$Diagnosis_Code
mdat[stringr::str_detect(mdat$Diagnosis_Code, 'F20.*'),]
196) (CONTINUED FROM ABOVE) Data summarization using dplyr and lubridate (courtesy Shanquan Chen)
table command displays summary statistics (like frequency)
year in lubridate gets the year from a date
table(lubridate::year(mdat$Diagnosis_Last_date))
197) (CONTINUED FROM ABOVE) Convert data to long format using dcast (courtesy Shanquan Chen) (link)
df_meds_summarized <- dcast(df_records_patients_using_topMeds, rid + Death_Flag + Gender_Code ~ drug, length)
Also
dcast(drug_table, rid ~ drug, any)
Also
dcast(drug_table, rid ~ drug, fun.aggregate = function(d) {ifelse(any(d), 1, 0)})
Also see
drug_table <- data.table(
rid = c(rep(c('a', 'b', 'c'), each=3), 'a'),
drug = c('citalopram', 'insulin', 'simvastatin',
'clozapine', 'metformin', 'simvastatin',
'fluoxetine', 'mirtazapine', 'omeprazole', 'simvastatin')
)
# add a column value
drug_table$value <- TRUE
library(reshape2)
patient_table <- dcast(drug_table, rid ~ drug, any)
patient_table <- dcast(drug_table, rid ~ drug, fun.aggregate = function(d) {ifelse(any(d), 1, 0)})
patient_table <- dcast(drug_table, rid ~ drug, fun.aggregate = function(d) {ifelse(length(d) > 0, 1, 0)}, value.var = 'drug')
198) (CONTINUED FROM ABOVE) Alternative using data.table
# The problem:
# - to take a list of drugs, and map them to a list of patients,
answering the
# question: for each patient, does the drug occur?
library(data.table)
drug_table <- data.table(
rid = c(rep(c('a', 'b', 'c'), each=3), 'a'),
drug = c('citalopram', 'insulin', 'simvastatin',
'clozapine', 'metformin', 'simvastatin',
'fluoxetine', 'mirtazapine', 'omeprazole', 'simvastatin')
)
patient_table <- data.table(
rid = c('a', 'b', 'c'),
dummy = c(1, 2, 3)
)
setkey(patient_table, rid)
drug_frequencies <- drug_table[, .(n = .N), by = drug]
top_drugs <- drug_frequencies[n >= 1, drug] # silly
for (drugname in top_drugs) {
cat("Thinking about drug:", drugname, "\n")
count_this_drug_by_patient <- drug_table[drug == drugname, .(n =
.N), by = rid]
patient_table[
count_this_drug_by_patient, on = "rid",
# c(drugname) := n # number of occurences of the drug
c(drugname) := ifelse(n >= 1, 1, 0) # does the drug occur?
]
drugval <- patient_table[, get(drugname)]
patient_table[is.na(drugval), (drugname) := 0] # force NA to zero
}
# Note re data.table:
# ... trailing [] to prevent "doesn't print first time" bug:
#
# https://github.com/Rdatatable/data.table/blob/master/NEWS.md#bug-fixes-5
patient_table <- patient_table[] # fix nonprinting bug; see above
print(patient_table)
199) Connect to databases using the RODBC package (link)
200) All kinds of classification trees, regression trees, decision trees and random forests in R (link)
201) Making indicator variables
# I() function makes an indicator variable here the response 0 or 1
# For example
gam_m_fit_simple_logistic <- gam::gam(formula = I(wage>300) ~ s(year,df=4) + s(age,df=4),
family = binomial,
data = Wage)
202) Boxplot in R with formula
boxplot(frequency ~ attitude*gender, data=df_politeness, col=c("white","lightgray") )
203) ggplot axis label tilt to help with visibility (link)
# Use theme()
gp2 <- ggplot(df_patients_joined_meds_topdrugs, aes(x=drug))#,fill=Gender_Code))
gp2 <- gp2 + geom_histogram(stat = "count")#(alpha=0.5)
gp2 <- gp2 + xlab("Drug type") + ylab("Count") + title("Breakdown of data by type and name")
gp2 <- gp2 + facet_wrap(Gender_Code~.)
gp2 <- gp2 + theme(axis.text.x=element_text(angle = -90, hjust = 0))
# gp2 <- gp2 + facet_grid(drug~.)
gp2
204) R script for installing multiple packages (link)
205) Evaluate using eval() (link)
eval(parse(text = '5+5'))
# ANOTHER TWIST if you are calling it in a function and want this to persist in the environment
# that called the function
eval( parse(text = '5+5'), envir = parent.frame() )
# CONCEPT: parent.frame() gets the environment in the call stack above this call
see code here
206) Other dplyr functions
merge, mutate, select, arrange
207) tidy function in broom package to clean up or tidy data
library(broom)
tidy_lm_aq <- tidy(lm_aq)
tidy_lm_aq
208) The R package visdat can also be used for rapid data visualization (see link)
library(visdat)
vis_dat(airquality)
209) Use of apply() and paste() to concatenate or paste all columns of a data frame;
how to concatenate or paste all columns of a data frame (on stackoverflow)
apply(df_tcr_repertoire_age_1wk, 1, paste, collapse="")
210) Namespace like system to manage R functions (link)
For an example see (link)
util = new.env()
util$bgrep = function [...]
util$timeit = function [...]
while("util" %in% search())
detach("util")
attach("util")
211) Power calculation in R (link)
sample size
power.t.test
power.t.test(n = NULL, power = .95, sd = 5, alternative = "two.sided", sig.level = 0.001, delta = 0.1)
212) Create venn diagram and euler diagram (link) (link)
VennDiagram
eulerr
library(VennDiagram)
grid.newpage()
draw.triple.venn(area1 = 22, area2 = 20, area3 = 13, n12 = 11, n23 = 4, n13 = 5,
n123 = 1, category = c("Dog People", "Cat People", "Lizard People"), lty = "blank",
fill = c("skyblue", "pink1", "mediumorchid"))
213) Great list of functions for doing statistics and plotting (rlib)
# Load packages and settings
library(sqldf)
library(ggplot2)
library(knitr)
library(rmarkdown)
library(gplots)
library(RColorBrewer)
library(reshape2)
library(png)
library(grid)
library(gridExtra)
library(lme4)
library(lmerTest)
library(lubridate)
library(reshape2)
library(data.table)
library(pheatmap)
library(cba)
library(rstanarm)
library(shinystan)
library(rstantools) # make a rstanarm package
library(bayesplot)
library("factoextra") #install_github("kassambara/factoextra")
library(rpart)
REQUIRED_PACKAGES <- c(
"car", # may need: sudo apt install libgfortran3
"data.table",
"ggplot2",
"lubridate",
"RODBC",
"shinystan"
# "survival"
# "timereg"
)
LIBRARY_PREFIX <- "https://egret.psychol.cam.ac.uk/rlib/"
#source(paste0(LIBRARY_PREFIX, "datetimefunc.R"))
source("datetimefunc.R")
source(paste0(LIBRARY_PREFIX, "dbfunc.R"))
source(paste0(LIBRARY_PREFIX, "listfunc.R"))
source(paste0(LIBRARY_PREFIX, "listassign.R"))
source(paste0(LIBRARY_PREFIX, "miscfile.R"))
source(paste0(LIBRARY_PREFIX, "misclang.R"))
source(paste0(LIBRARY_PREFIX, "miscstat.R"))
source(paste0(LIBRARY_PREFIX, "stanfunc.R"))
source(paste0(LIBRARY_PREFIX, "cris_common.R"))
misclang$library_or_install(REQUIRED_PACKAGES)
# As advised by Stan:
rstan::rstan_options(auto_write = TRUE)
options(mc.cores = parallel::detectCores())
# Other misc commands
BIGSEP <- "===============================================================================\n"
SMALLSEP <- "-------------------------------------------------------------------------------\n"
214) ggplot with text within figure
require(ggplot2)
require(PRROC)
require(grid)
#################################################################################
# convert to ggplot
theme_set(theme_classic())
y <- as.data.frame(prroc_object$curve)
gp <- ggplot(y, aes(y$V1, y$V2))
gp <- gp + geom_path()
gp <- gp + xlim(0,1)
gp <- gp + ylim(0,1)
gp <- gp + geom_hline(yintercept=i_y_line_threshold_signif, linetype="dashed")
gp <- gp + xlab("Recall")
gp <- gp + ylab("Precision")
############
# add text
my_text <- paste("AUC = ", as.character(round(prroc_object$auc.integral, digits = 2)
))
my_grob = grid.text(my_text, gp=gpar(col="black", fontsize=14, fontface="bold"))
gp <- gp + annotation_custom(my_grob)
gp
215) Create combinations using combn or expand.grid
combn(c('a','b','c'), 2) # all combinations of 3 letters taken 2 at a time
combn(letters[1:4], 2)
expand.grid( height = seq(60, 80, 5), weight = seq(100, 300, 50), sex = c("Male","Female") )
216) Create a new data.table
sample_data <- data.table(subject_id = 1:100)
OR
dt_demographics <- data.table(
read.csv('demo.csv',
sep = ',', header = TRUE,
stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)
)
setkey(dt_demographics, STUDY_SUBJECT_DIGEST)
OR
Create a new data.table and add columns
library(data.table)
library(lubridate)
library(tidyr)
START_YEAR <- 2016
END_YEAR <- 2017
# Define years of interest
years <- data.table(
year = START_YEAR:END_YEAR
)
# add a new column
years[, year_start_date := as.Date(paste0(year, "-01-01"))]
years[, year_end_date := as.Date(paste0(year, "-12-31"))]
MALE <- "M"
FEMALE <- "F"
# create another data.table
sexes <- data.table(sex = c(MALE, FEMALE))
age_ranges <- data.table(
age_range_start = c(0, 1, seq(5, 90, 5))
)
age_ranges[, age_range_end := age_range_start + 4]
OR
library(dplyr)
library(data.table)
df
%>%
mutate(date = as.Date(date)
%>%
as.data.table()
217) filter in data.table
# Filter to relevant years only:
subject_mortality_f20 <- subject_mortality_f20[
year >= lubridate::year(subject_start_date) &
year <= lubridate::year(subject_end_date)
]
218) Merge in data.table
# Merge in standardized mortality (create a new table, for clarity):
subject_and_population_mortality_f20 <- merge(
subject_mortality_f20,
population_mortality[, c("year", "sex", "age_range", "deaths_per_person_per_year")],
by.x = c("year", "sex", "age_range_this_year"),
by.y = c("year", "sex", "age_range")
)
219) Crossing or cross-product or all combinations in data.table
# Make a table with a row for each subject/year combination:
subject_mortality_f20 <- data.table(tidyr::crossing(sample_data_f20, years))
220) For each patient get minimum date and group by each patient (select min and group by in SQL) se using data.table (link)
# use .SD[1] construct
setkeyv(dt_tests_with_age, c("STUDY_SUBJECT_DIGEST", "ResultDate"))
# sort by age
dt_first_test_per_patient <- dt_tests_with_age[, .SD[1], by=c("STUDY_SUBJECT_DIGEST")] # *** check sorting
# .SD[1] gets first row
# set that to a new column
dt_first_test_per_patient$first_creatinine_datetime <- dt_first_test_per_patient$ResultDate
# then merge with original table
dt_egfr_with_age_withfirst <- data.table::merge.data.table(x = dt_tests_with_age,
y = dt_first_test_per_patient,
by = 'patid')
221) Create a summary of number of counts of a field (like select sum in SQL) (link)
# Create sum of tests for each patient using .N
dt_patients_with_two_creatinines <- dt_all_tests[
,
.(n_creatinine_tests = .N),
by = 'STUDY_SUBJECT_DIGEST' # by subject
]
# using .N
222) Saving memory by using pass by reference (:=) instead of pass by value (<-) in data.table
# 1st alternative in plain R using <-
dt_tests_with_age$age_calculated_asof_test <- as.numeric( dt_tests_with_age$ResultDate - dt_tests_with_age$date_of_birth_approx )
# 2nd alternative in data.table using :=
dt_tests_with_age[
,
age_calculated_asof_test := as.numeric(ResultDate, date_of_birth_approx)
]
223) Do inner join in data.table using merge (other joins like left join link)
# set primary key in data.table using setkey
setkey(dt_tests_with_age, STUDY_SUBJECT_DIGEST)
setkey(dt_first_test_per_patient, STUDY_SUBJECT_DIGEST)
dt_tests_with_age <- merge(dt_tests_with_age, dt_first_test_per_patient, by=c("STUDY_SUBJECT_DIGEST"))
224) Rename columns in data.table
setnames(dt_tests_with_age
,c("ResultNumeric.x", "age_calculated_asof_test.x", "GENDER_DESC.x" , "ETHNIC_GROUP_DESC.x")
,c("ResultNumeric" , "age_calculated_asof_test" , "GENDER_DESC" , "ETHNIC_GROUP_DESC" )
)
225) Get counts (like count(distinct(*)) in SQL) using uniqueN in data.table
dt_demographics <- data.table(
read.csv('demo.csv',
sep = ',', header = TRUE,
stringsAsFactors=FALSE, na.strings="..") # ,strip.white = TRUE)
)
setkey(dt_demographics, STUDY_SUBJECT_DIGEST)
n_patients <- uniqueN(dt_demographics$STUDY_SUBJECT_DIGEST)
# get counts and group by year of birth and gender (courtesy Rudolf Cardinal)
# like count(distinct()) ….. group by yob, gender
dt_merged_treatment[, .(n_lithium_patients = .N), by = .(yob, gender)]
# can create a new column based on this using :=
dt_merged_treatment[, n_lithium_patients := .(n_lithium_patients = .N), by = .(yob, gender)]
# if any errors use the conflicted package
library(conflicted)
conflict_prefer("list", "gsubfn")
# now select only those with 2 lithium_patients i.e select count(n_lithium_patients) as n …..
dt_selected <- dt_merged_treatment[ n_lithium_patients >= 2 ]
226) Union of two tables (like union in SQL) using rbindlist (link)
list_tables <- list(dt_epic_labtests, dt_meditech_labtests)
dt_all_tests <- data.table::rbindlist(list_tables, use.names = TRUE, fill = TRUE)
225) Read excel in R (readxl)
# Read in real ONS data
library(readxl)
smr_table_ons <- readxl::read_xls(path = 'deathsummarytables2017final.xls', sheet=4)
226) Make or turn a matrix into a vector in row-major format (link)
vector_ons_mortality_2016_2017 <- as.vector(t(mat_ons_mortality_2016_2017))
227) ifelse command
df_time_to_event$time_to_event <- ifelse(df_time_to_event$Death_Flag,
as.numeric(df_time_to_event$Date_Of_Death_lubridate - df_time_to_event$Date_Of_referral_lubridate)/DAYS_PER_YEAR,
as.numeric(END_DATE_CONSTANT_lubridate - df_time_to_event$Date_Of_referral_lubridate)/DAYS_PER_YEAR
)
228) Memory free up size in R (link) (other tips to free up memory)
memory.limit()
memory.limit(size = 4095) # in MB
gc() # garbage collector
229) Randomly sample lines from a big csv file (link)
# randomly sample a few patients
set.seed(12)
i_NUM_PATIENTS_SAMPLE = 10000
temp_list_row_numbers = 1:nrow(dt_demographics)
# now sample some patients from them
i_temp_sample_patient_index = sample(temp_list_row_numbers, i_NUM_PATIENTS_SAMPLE)
# CAUTION: overwrite demographics table
dt_demographics <- dt_demographics[i_temp_sample_patient_index, ]
230) Dropfactors or droplevels (link)
df_temp_summary <- droplevels(df_temp_summary)
231) Remove duplicate columns (link)
df_time_to_event <- subset(df_time_to_event, select=which(!duplicated(names(df_time_to_event))))
232) Get the last 3 digits of a string (link) courtesy Shanquan Chen
# use stringr
x <- "some text in a string"
substrRight <- function(x, n){
substr(x, nchar(x)-n+1, nchar(x))
} Convert string to lower case str_to_lower() courtesy Rudolf Cardinal 233) Two strategies to merge, join, inner join (courtesy Rudolf Cardinal) dt_case_control_matched <- data.table::merge.data.table(dt_merged_treatment[i_temp_index,],
dt_merged_control,
#allow.cartesian = TRUE,
by=c("yob","gender","region")
)
# more efficient
dt_case_control_matched <- dt_merged_control[yob == dt_merged_treatment[i_temp_index,]$yob
&
gender == dt_merged_treatment[i_temp_index]$gender
&
region == dt_merged_treatment[i_temp_index]$region
]
234) Very advanced data.table (code) (stackoverflow)
Set column value using the first next row in the same group that meets a condition
# Adapted from
# https://stackoverflow.com/questions/55070842/set-column-value-using-the-first-next-row-in-the-same-group-that-meets-a-conditi?noredirect=1&lq=1
dt <- structure(list(
id = c(1L, 1L, 2L, 2L, 3L, 4L, 5L, 5L, 5L, 6L, 6L, 6L),
code = c("p", "f", "f", "p", "p", "<NA>", "f", "p", "p", "f", "p", "p"),
date_down = structure(c(17897, 17898, 17898, 17899, 17900, 17901, 17903, 17903, 17905, 17906, 17906, 17906), class = "Date"),
date_up = structure(c(17898, 17899, 17898, NA, NA, 17901, 17904, 17904, 17905, 17906, 17906, 17907), class = "Date")),
class = c("data.table", "data.frame"),
row.names = c(NA, -12L))
data.table::setDT(dt) # to reinit the internal self ref pointer (known issue)
dt[, row := .I] # add row numbers (as in all the solutions)
data.table::setkey(dt, id, row, date_down) #set key for dt
head(dt)
# TODO: do for each patient patid == temp_patient_id
# For all rows of dt, create found_date by reference :=
dt[, found_date :=
# dt[code=='p'][dt,
dt[code=='p'][.(.SD), # our subset (or another data.table), joined to .SD (referring to original dt)
.( x.date_up ),
on = .(id==id, row > row, date_up > date_down),
mult = "first"] ]
head(dt)
View(dt)
235) Remove rows in data.table
# remove those rows marked for removal
dt_egfr_with_age_merged_lithium <- dt_egfr_with_age_merged_lithium[ remove_row == 0, ]
235) Leave one out cross validation in R using the loocv package (link)
236) Create an empty data frame initialise an empty data frame of a certain size (from stackoverflow)
mat_empty_matrix <- matrix(data = 0, nrow = 100, ncol = 21 )
df_empty <- data.frame( mat_empty_matrix, stringsAsFactors = FALSE)
235) Convert a data frame to a matrix using data.matrix
236) Show fit of linear model lm() using ggplot
abline
237) Testing using testthat (VERY GOOD link) (link) (link)
# very simple example below
library(testthat)
library(devtools)
string <- "Testing is fun! hellooooo..."
expect_match(string, "Testing")
expect_match(string, 'hello', ignore.case = TRUE)
# expect_output(as.character(string), 'hello')
238) Making a package in R using devtools (link)
devtools::test()
devtools::test(filter = 'test.R') # where test.R is below
library(testthat)
string <- "Testing is fun! hellooooo..."
expect_match(string, "Testing")
expect_match(string, 'hello', ignore.case = TRUE)
239) Check where libraries are stored
.libPaths()
240) Read in and write Stata (.dta) files
library(haven)
library(tidyverse)
a <- haven::read_dta(file = 'test.dta')
haven::write_dta(data = b, path = '~/mod.dta')
241) R environments and calling scope and lexical scope (link)
242) Initialize a matrix with zeros (link)
matrix(data = 0, nrow = 10, ncol = 10)
243) Bookdown in R
Simple example (link)
244) Write the output of a model summary or any model to a log file
str_log_filename = 'log_model1_offals.csv'
sink(str_log_filename)
print(summary(meta_model))
sink()
245) Append to list incrementally
list_input_covariate = c('BMI')
list_input_covariate = cbind(list_input_covariate, 'AGE')
list_input_covariate = cbind(list_input_covariate, 'GENDER')
OR
list_input_covariate = NULL
list_input_covariate = cbind(list_input_covariate, 'AGE')
list_input_covariate = cbind(list_input_covariate, 'GENDER')
246) How to make a package in R (link, github, youtube)
247) Generate synthetic data in R
simstudy package
https://cran.r-project.org/web/packages/simstudy/vignettes/simstudy.html library(simstudy) # generate definition def <- defData(varname = "age", dist = "normal", formula = 10, variance = 2) def <- defData(def, varname = "female", dist = "binary", formula = "-2 + age * 0.1", link = "logit") def <- defData(def, varname = "visits", dist = "poisson", formula = "1.5 - 0.2 * age + 0.5 * female", link = "log") # generate data dd <- genData(1000, def) dd
248)Plotting CDF cumulative distribution function using stat_ecdf() in ggplot
# age should be a factor
gp <- ggplot(data = df_all, x = energy, aes(x = energy, color = age_factor)
gp <- gp + geom_step(aes(y=..y..), stat = "ecdf")
gp
249) R profiler (link, link) (code)
Rprof
profviz
250) qgraph for plotting graphs in R (link)
251) bookdown
Recipe by Tom Bishop
Bookdown recipe:
In RStudio, install the bookdown package
install.packages('bookdown')
Start a new project, but choose the project to be a book.
In Github create an empty project (no README.md) with the same name as your book
In RStudio go to Tools -> Project options -> Version control -> Git (it will ask you to reload the project)
On Git command line, add the remote and push: git remote add origin https://github.com/USERNAME/PROJECT.git git push -u origin master
In the file _bookdown.yml, and the line output_dir: "docs"
In the file _output.yml, delete the parts that refer to generating a pdf (I couldn’t get this to work):
bookdown::pdf_book: includes: in_header: preamble.tex latex_engine: xelatex citation_package: natbib keep_tex: yes bookdown::epub_book: default
Check in local changes
In Github go to Settings -> Pages and set the source to the main branch and folder to docs
252) sessionInfo()
to get all details related to session, what version packages are installed, version, etc.
253) Display pretty formatted R code in rmarkdown
```{r eval=FALSE}
x = y
```
254) Object oriented programming class in R S3 and S4 class (link)
255) Create inst/CITATION file for citing R package (link)
library(usethis)
usethis::use_citation()
256) Add a dialog box or popup box in shiny R
https://stackoverflow.com/questions/48523338/how-can-i-add-a-message-box-in-r-shiny
shiny::showModal(modalDialog(title = "Minimum range cannot be greater than maximum for p",))
257) Set working directory to current directory
setwd(dir = getwd())
258) Rmarkdown class (link)(github)
https://ac812.github.io/reproducibility-training/rmarkdown.html