R Basics

setwd("C:\\") #set working directory (Note: Need to use \\)
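test=read.csv( "file.csv" , sep="," , header = TRUE, colClasses="#e.g. character, numeric, or NULL for not to read col", nrows="#number of rows to read") #read files in csv (comma) / csv2 (semicolon) / tsv (tab); readr::read_csv() uses col_types (c for character, d for double, - for not to read col) and n_max instead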
x=read.table("#dir", sep=",", header = TRUE) #read table with sep="," or "\t" or "@" and import data

Exporting data

write.table(x,"file.csv",sep=",") #export data with .csv or .xls or .txt

Data Types

Things to look out

Format and dimension of the data
Variable names and how they are stored
Missing data? Flawed data?

Data types

x=1 is scalar, y=c(1,2,3) is vector, z = matrix(c(), nrow, ncol)
- z[] --> Select all
- z[x,y] --> Select specific row x and col y
- z[c(x1,x2), c(y1,y2)] --> Output of 4 elements (x1y1, x1y2, x2y1, x2y2)
x = c(#e.g. 1:5) #numeric vector of 1 to 5
x = list (c(#e.g. 1:5),"e.g. Hello World") #list
#as.numeric(x) #as.logical(x) #as.character(x) #as.complex(x) #change type
object.size(#matrix) #file size
#df = data.frame(#data, stringsAsFactors=FALSE) #prevent R from converting string columns to factors

List all variables

ls()

Assigning values

x[e.g. 1]=3
x[e.g. 1,2]=3 #assign 3 to row 1 col 2

Referencing

with(#matrix, function) #No need to repeat name of table
attach(#df) #reference to data frame so that no need to repeat
- detach(#df)
#dataframe = subset(#df, #df$#col=='#value', select = c('#col')) #create subset

Equality

==
!=
& #evaluate ALL elements with AND
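&& #evaluate a single (length-1) condition with AND, short-circuiting (older R versions used only the first element; since R 4.3 a longer vector is an error)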
| #evaluate ALL elements with OR (i.e. at least 1 element is true)
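|| #evaluate a single (length-1) condition with OR, short-circuiting, i.e. TRUE if at least 1 side is true (older R versions used only the first element; since R 4.3 a longer vector is an error)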
xor() #exclusive OR, i.e. xor(FALSE, TRUE)=TRUE, xor(TRUE, TRUE)=FALSE
isTRUE(x > y)
which(x) #list element number of TRUE
any(x) #check if any element meets the criterion
all(x) #check if all elements meet the criterion

Test for finite / infinite data

is.finite(x)
is.infinite(x)

Error

NA: Not available (i.e. missing data)
NaN: Not a number error (e.g. 0/0)
Inf

Test for error cases

is.na(x) #is.na(x) is not the same as x==NA
is.nan(x)
anyNA(x) #test if there is any NA in array
complete.cases(x) #test for opposite of na
#mean(#df, na.rm = TRUE) #calculate with NA removed (values still stay)
na.omit(#df) #remove rows with NA

clean<-complete.cases(x); x[clean] #remove NA

Mathematical Operations

%% #modulus
sqrt()
abs()
round()
factorial()
sample(#range of values e.g. c(0,1), #number of toss, replace = TRUE, #skewed probability e.g. c(0.3, 0.7)) #random numbers with replacement after each toss
unique(x) #unique values

User-defined operations

"%#var_name%" <- function()

Matrix

Creation of matrix

x=matrix(c(e.g. 1:10),nrow= e.g. 2,ncol= e.g. 5) #create matrix
x=data.frame(x=c(e.g. 1:4),y=c("e.g. Hello","e.g. World")) #create data frame (table whose columns can have different data types)
x=matrix(rep(#value,#number_of_repetition),ncol=e.g. 2,nrow=e.g. 3) #create repeated values in matrix
x=rep(c(#value1, #value2), #repetition) #create repeated values
x=rep(c(#value1, #value2), each=#repetition of same value) #create repeated value 1 for x times before repeating next value 2
x=matrix((e.g. 5:10), e.g. 2, e.g. 3) #create matrix from 5 to 10 with 2 rows, 3 cols
x=seq(e.g. 1,e.g. 10,by = e.g. 2) #create sequence of values from 1 to 10 with increment of 2
seq_along(#vector) #create a sequence from 1 to the length of the vector (use seq_len(#length) for a given length)
x=matrix(rnorm(#numberofrand),#row,#col) #create matrix with random numbers
- set.seed(#e.g. 5) #prevent random numbers from re-generated each time
x <- vector("numeric", length = e.g. 10) #create an empty vector 0 0 0 0 0 0 0 0 0 0
x<- c(#text1 = #number1, ..., #text = #number) #create each row of text and numbers
x[#element number:#element number] #display elements in a matrix
x[x e.g. > 0] #display elements in a matrix that is more than 0, including NA
x[e.g. !is.na(x) & x e.g. > 0] #display elements in a matrix that is more than 0
x[- c(#element number)] #display all values except for a particular element number

Adding and removing rows and columns

x$colname=c(e.g. 1,2,3,4) #add col
colname=c(e.g. 1:10); x=cbind(x,#colname) #add col
rowname=c(e.g. 1:10);x=rbind(x,#rowname) #add row
x=x[- #rownumber,- #colnumber] #remove row and col

Renaming and reordering columns

colnames(x)=c("#col1", "#col2") #rename col
x=x[c(#colnumber, #colnumber, #colnumber)] #reorder columns

Summary of matrix

names(x) #view column names
head(#datatablename) #view top 6 rows of data
head(#datatablename,#rowtoview) #view top 2 rows of data
tail(#datatablename,#rowtoview) #view bottom 2 rows of data
colnames(x)[colnames(x)=="#old col name"]="#new col name" #rename col name
table(#table$#colname) #display summary of col
str(#matrix) #view structure of matrix with data type, row name, col name and values

ncol(x) #count number of columns
nrow(x) #count number of rows
dim(x) #count dimension of matrix
length(x) #length
summary(x) #view summary (min, 25%, 50%, mean, 75%, max) for each col
quantile(x) #view summary (0%, 25%, 50%, 75%, 100%) for each col
which.max(x) #find position (index) of the maximum value; use max(x) for the value itself

Subsetting

subset(#matrix, #condition)

Mathematical Operations

z=x+y #matrix addition
x=3*x #scalar multiplication
z=x*y #matrix element-wise multiplication
z=x%*%y #matrix multiplication (inner product of two vectors gives 1 number)
z=x%o%y #outer product (matrix of all pairwise products)
det(x) #determinant
solve(x) #inverse
identical(x, y) #check if vectors are identical
distinct(x) #unique rows (dplyr); use n_distinct(x) for the number of unique values

t(x) #transpose

x=diag(3) #generate identity matrix of 3x3

Normal z-distribution

qqnorm(x) #Q-Q plot (sample data quantiles VS theoretical normal distribution quantiles; use qqplot() to compare against other distributions e.g. uniform or t)
qqline(x) #straight line if it's normally distributed
shapiro.test(x) #Test for normality

qnorm(#0.975) #get quantile value (z-score value) in 1 tail for e.g. 95% confidence interval to return 1.96
pnorm(#1.64) #get cumulative probability value in 1 tail for e.g. 1.64 z-score to return ~0.95 (i.e. 90% two-sided)
limit = mean(#x) + qnorm(#0.975) * (sd(#x) / sqrt(length(#x))) #confidence interval
sd(x) #standard deviation
sd(x) / sqrt(length(#x)) #standard error

curve(dnorm(#x, mean(#x), sd(#x)), add = TRUE, col = "#red", lwd = 2) #plot approximate density normal function
- lines(density(#x), col = "#blue") #plot actual smooth curve

T distribution (Cater to small sample size, both equal and unequal variance)

qt(p = #0.975, df = #sample size - 1) #get quantile value (t-score value) in 1 tail for e.g. 95% confidence interval
pt(#1.64, df = #sample size - 1) #get probability value in 1 tail for e.g. 1.64 t-score
limit = mean(x) + qt(p = #0.975, df = #sample size - 1) * (sd(x) / sqrt(length(x))) #confidence interval
cov(x,y) #covariance
cor(x,y) #correlation
colMeans(#matrix) #display column mean

Proportion Test

p = c(#1, #2)
#total = c(#1, #2)
prop.test(#p, #total, alternative = "#less") #var 1 is less than #2

Bayesian Probabilistic Inference (Predictive Analytics)

Function

function_name <- function(#var1, #var2) { }
e.g. function_name(function(x){#return value}, #x-value)

If... Else...

if(#statement==#condition1) { print("#text1") } else { print("#text2") }

Loop

for (i in e.g. 1:10){ e.g. z[i] = x[i]*y[i] }
replicate(#number of replicates, function) #repeat function

Apply

apply(#matrix, #1 for row or 2 for col, #function) #function loop
lapply(#matrix, #function) #function loop for list
lapply(#matrix, #function(x) x[n]) #function loop for list - return element n
sapply(#matrix, #function) #function loop for list in a simplified one-liner
vapply(#matrix, #function, #datatype e.g. numeric(1)) #function loop for list in a simplified one-liner (safer than sapply by specifying the type of output expected)
as.character(class(#matrix)) #return type of var in matrix in a list
tapply(#datatable$colname, #datatable$groupbycolname, #function) #function loop for vector and group by category

Nested function

return (e.g. fx)

Probability functions

r*** for "random"
- rbinom (#number of variable 1, #number of flips size = e.g. 100, prob = e.g. 0.7) #binomial distribution
- rnorm(#number of variable e.g. 10, mean = e.g. 100, sd = e.g. 25) #normal distribution
- rpois(#number of variable e.g. 10, #mean) #Poisson distribution
- rexp() #exponential
- rchisq() #chi-squared
- rgamma() #gamma
d*** for "density"
p*** for "probability"
q*** for "quantile"

Text operations

Concatenation

paste(x, collapse = "#character to add in between text of entire result") #concatenate
paste("#value1", "#value2", sep = "#character to add in between text of each term") #concatenate
paste("#value1", "#value2", sep = "#character to add in between text") #concatenate first element of vector 1 with that of vector 2 before next element
paste0("#value1", "#value2") #concatenate without separator
paste("#statement", #vector) #print each statement with an element

Formatting and counting

names(x) #display text
nchar(x) #count number of characters
toupper(x) #UPPERCASE
tolower(x) #lowercase
str_to_title(x) #Title Case (use str_to_sentence(x) for Sentence case)
str_trim("#string") #trim whitespace from the ends of the string (Note: Not inside)
LETTERS #all alphabetical characters in vector

Search

grepl(#search value, #string) #search if string contains search value
grepl("#start . #end", #string) #search if string contains search value beginning with #start and ending with #end
grepl("(#search value) {#min adjacent repeat occurrence, #max adjacent repeat occurrence}", #string) #search if string contains search value (Note: search value can be (x.{#occurrence}) {#occurrence}
word("#string", #position of word to extract) #extract word from sentence

Metacharacter

. Any Character
\\w #A Word (letter, digit, underscore)
\\W #Not a Word
\\d #A Digit (0 to 9)
\\D #Not a Digit
\\s #Whitespace (line breaks, tabs, spaces)
\\S #Not Whitespace
\\#special characters #Special characters
[xyz] #A Set of Characters
[^xyz] #Negation of Set (i.e. all except xyz)
[a-z] #A Range of Characters from a to z
^ #Beginning of String
$ #End of String
\n #Newline
+ #One or More of Previous
* #Zero or More of Previous
? #Zero or One of Previous
| #Either the Previous or the Following
{5} #Exactly 5 of Previous
{2,5} #Between 2 and 5 of Previous (no space inside the braces)
{2,} #2 or More of Previous
x|y #search for x or y

E.g.
- #matrix( grepl(#start_end_with_vowel "^[AEIOU]{1}.+[aeiou]{1}$", #matrix) ) #display list of words that starts with capitalized vowels and end with lowercase vowels

Substitution

sub("[#search value]", "#new value", #vector) #substitute first instance
gsub("[#search value]", "#new value", #vector) #substitute ALL instances
str_pad("#text", width = #total length of string e.g. 8, side = "#adding char from e.g. left", pad = "e.g. -") #add char - from the left so that total length is e.g. 8

Splitting and extracting strings

strsplit(#vector, "#search value")
str_extract("#vector", "#search value e.g. [0-9]+")

Arranging strings in order

str_order(#vector)

Unpacking arguments

args <- list(...)
[Cont'd] x<- args[["x"]]

Date and Time

Time is wrt 1970-01-01
Sys.Date() #YYYY-MM-DD (Date)
Sys.time() #YYYY-MM-DD HH:MM:SS +UTC (POSIXct or POSIXlt)
x$min #display min
strptime(x, "%B %d, %Y %H:%M") #convert time
difftime(Sys.time(), x, units = #type of time passed 'days') #difference between time

Graph plotting

https://www.r-graph-gallery.com/all-graphs/

Packages

install.packages("ISLR") #scatterplot
install.packages("plotrix") #waterfall
install.packages("dendroextras") #dendrogram
- install.packages("NeatMap")
- install.packages("specular")
- install.packages("dynamicTreeCut")
- prcomp(, scale=TRUE) + biplot(pr.out, scale = 0) #plot Principal Component Analysis for 4 quadrants
install.packages("pheatmap") #heat map
install.packages("portfolio") #tree map in map.market()
install.packages("googleVis") #interactive plot
- gvisScatterChart
- gvisBarChart
- gvisLineChart
- gvisTimeline
- gvisHistogram
- gvisBubbleChart
- gvisTreeMap
- gvisGeoMap
- gvisMotionChart
brewer.pal(#number of shading, "#e.g. Greens") #colour palette

Formatting

#data = #data[order(#data$col, decreasing = TRUE), ] #order data (rows) by col
par(bg = "e.g. white", las = #axis label_type e.g. 1, col.lab = "#col label colour e.g. black", col.axis = "#col axis colour e.g. white", bty = "e.g. n", cex.axis = #axis size e.g. 0.9, cex.lab = #label size e.g. 1.5, mar = c(#margin for bottom, #left, #top, #right))
par(mfrow = c(#row,#col), mar = c(#bottom, #left, #top, #right)) #divides plot area into rows and columns with margins
gvisMerge(#graph1, #graph2, horizontal = TRUE) # Display graphs side by side
grDevices::windowsFonts(#font = windowsFont("#font name")) and par(family="#font") #Change font (Windows only)
xkcd #xkcd theme

Scatterplot

plot(x=#data$colname for x axis, y=#data$colname for y axis, col = rainbow(#colours) OR col = c(#colour based on group), pch = #?points type of point coloured circle 20, xlim=c(0,max(#data$colname for max x-value)), type = "o" (to connect points), main = "#title", sub= "#subtitle under x-axis", xlab = "#x-axis label", ylab ="#y-axis label")
legend("topright",cex = #size e.g. 0.6, fill = c("#colour e.g. red for var1","#colour e.g. black for var 2"), legend = c("#var1" ,"#var2"))

abline(h=#value or v=#value, lwd = #line width e.g. 1, lty = #type of line e.g. 4 for dotted, col = "e.g. red") # create horizontal or vertical line
text(#data$colname, pos = e.g. 2, offset = e.g. 0) #add text to graph
title(main = "#title") #add title to graph

scatter.smooth(#data$colname for x axis, #data$colname for y axis, pch = type of point e.g. coloured circle 20, lpars = list(lty = e.g. 3, col ="e.g. black", lwd = e.g. 2)) #smooth Locally Weighted Scatterplot Smoothing line

Histogram

hist(#matrix$col) #histogram
rug(#matrix$col) #histogram with density of tick marks

Barchart

barplot(data, main="#title", xlab="#x-axis label", col=c("#barcolour e.g. darkblue","#barcolour e.g. red", "#barcolour e.g. yellow"), beside=#stacked FALSE or side-by-side TRUE, legend = rownames(data))

Boxplot

boxplot(formula = #y ~ #x, data = #data, col = "colour e.g. red")

Dendrogram

#d = dist(#dataframe, method = "euclidean")
hclust(#d, "ave") #hierarchical clustering for no prior knowledge of underlying cluster

Tree Map

gvisTreeMap (#data, idvar = "#grid label col", parentvar = "#drill down col", sizevar = "e.g. X2009", colorvar = "#display col", options = list(width = #e.g. 300, height = #e.g. 100, showScale = TRUE, maxColor = "#hex colour", minColor= "#hex colour", title = "", fontColor = "e.g. black")) #Note: googleVis / Google Charts use the spelling "color" in argument and option names
data$col1 [data$col2 == ""] = NA #Assign NA for row without relevant data

Pie Chart

pie(round(#data / sum(#data) * 100), col = e.g. #colour, radius = #e.g. 1, init.angle = #e.g. 90, clockwise = TRUE, labels = paste(c(#labels), "#e.g. %"), main = "#e.g. Title")

install.packages("plotrix")
library(plotrix)
floating.pie(#x coordinate for centre of pie e.g. 3, #y coordinate for centre of pie e.g. 3, #e.g. pie data, radius = #e.g. 1, col = #colour, startpos = #starting position in radians e.g. 4)
pie.labels(#x coordinate for centre of pie e.g. 3, #y coordinate for centre of pie e.g. 3, #angles e.g. pie chart, radius = #less than 1 to be plotted inside e.g. 0.4, labels = #e.g. labels)
legend("#e.g. left", fill = "#e.g. colour", legend = "#e.g. legend")

draw.circle(#x coordinate for centre of pie e.g. 3, #y coordinate for centre of pie e.g. 3, radius = #e.g. 0.5, col = "#e.g. white") #additional code for donut chart

Fan Chart

library(plotrix)
fan.plot(round(#data / sum(#data) * 100), col = e.g. #colour, max.span = pi, align = "#e.g. left", labels = paste(c(#labels), "#e.g. %"), main = "#e.g. Title")

Slope Chart

library(plotrix)
bumpchart(#data in e.g. columns 3 and 4 [, 3:4], lwd = "#line width e.g. 2", col = ("#hex"), top.labels = NA, rank = FALSE)
boxed.labels (#x coordinate e.g. 1, #y coordinate e.g. 11, labels = c(#col), col = "e.g. blue", border = FALSE)
boxed.labels (#x coordinate e.g. 2, #y coordinate e.g. 11, labels = c(#col), col = "e.g. red", border = FALSE)

3D Plots

plot3D::scatter3D() #3D scatterplot
rgl::plot3d() #interactive 3D scatterplot

plotrix::pie3D() #3D piechart

plot3D::hist3D() #3D histogram

plot3D::ribbon3D() #3D ribbon plot

plot3D::contour3D() #3D contour plot
plot3D::image2D() and persp3D() #3D contour and flat surface plot
plot3D::surf3D() #3D surface plot

animation::saveGIF() #animation of plot

Multidimensional Plots

graphics::sunflowerplot() #Plot density of data as sunflower diagram in scatterplot
hexbin::hexbin() #Plot density of data as hexagon in scatterplot
googleVis::gvisCalendar() #Plot density of data in calendar

plotrix::pyramid.plot() #Horizontal bar plots (e.g. gender differences)

plotrix::radial.plot() #Radial Line Plot
plotrix::radial.pie() #Plot Coxcomb / Polar Clock Plots (e.g. with the HistData Nightingale data)

igraph::plot() #Network Plot (E(g) for edges, V(g) for vertices)
igraph::tkplot() #Interactive Network Plot

aplpack::faces() #Plot Chernoff faces (e.g. macroeconomics data)

Continuous Data Plots

corrplot::cor() and corrplot() #Correlation Plot
corrplot::cor() and corrplot(method = c("#e.g. ellipse / number / square"), order = "#e.g. hclust", addrect = #number of clusters e.g. 4, rect.col = "#e.g. blue") #Correlation Plot with Clusters
ISLR::lm() #Regression Line

ISLR::boxplot() #Box and Whiskers Plot

stats::qqnorm() and qqline() with shapiro.test() #Quantile-Quantile Plot (e.g. deviation from normal distribution)

ISLR and vioplot::vioplot() #Violin Plot

ts() and decompose() #Decomposed Time Series
acf() #Correlogram (Auto Correlation Function ACF Plot or Autocorrelation plot)
forecast and fanplot::auto.arima(), simulate(), and fan() #Forecast using Autoregressive Integrated Moving Average Model (ARIMA)

quantmod::chartSeries() #Candlestick Plot (e.g. stock prices for opening, closing, high and low)
googleVis::gvisCandlestickChart() #Interactive Candlestick Plot

quantmod::lines() with hist() #Density Plot in Histogram

Word Cloud and Text Mining

wordcloud::wordcloud(#text, min.freq=#plot all words using e.g. 1, scale = c(#e.g. 2, 0.5), random.color = #e.g. TRUE, colors = #e.g. pal) #Word Cloud

tm::Corpus(DirSource("#dir")) #Open text files in directory
tm::tm_map() #Text mining tool to remove numbers, punctuations etc
tm::findFreqTerms(#data, #frequency e.g. 14) #Text mining tool to list words that appear n times
tm::findAssocs(#data, c(#word1, #word2), #correlation e.g. 0.5) #Text mining tool to list words that are related to terms with lower correlation limits

XML::xmlParse(#data), xpathSApply(#data, "//#value", xmlValue)#Parse XML data

Time Series

autoplot(#ts, facets = TRUE) #Plot time series
- Trends induce positive correlations in the early lags.
- Seasonality will induce peaks at the seasonal lags.
- Cyclicity induces peaks at the average cycle length.
Types of forecasting
- naive(#TS, #forecast length) #use most recent observation
- snaive(#TS, #forecast length) #seasonal
- Simple exponential smoothing
  - fc <- ses(#TS, h = 10) #Use ses() to forecast the next 10 years
  - summary(fc) #Use summary() to see the model parameters
  - autoplot(fc) + autolayer(fitted(fc)) # Add the one-step forecasts for the training data to the plot
- Holt's trend method
  - fcholt <- holt(#TS, damped=FALSE, h = 10) #Produce 10 year forecasts using holt()
  - summary(fcholt) #Look at fitted model using summary()
  - autoplot(fcholt) #Plot the forecasts
  - checkresiduals(fcholt) #Check that the residuals look like white noise
Holt-Winters' trend, seasonality method
Holt-Winters' trend, multiplicative seasonality method
- fc <- hw(#TS, seasonal = "multiplicative", h = 3) #Produce 3 year forecasts
- checkresiduals(fc) #Check if residuals look like white noise
- autoplot(fc) #Plot the forecasts
Box-Cox transformation
- BoxCox.lambda(#TS) #find out lambda value for transformation
auto.arima()
- farima <- function(x, h) {
- forecast(auto.arima(x), h=h)
- } #Set up forecast functions for ARIMA models
- e1 <- tsCV(#TS, farima, h=1) #Compute CV errors for ARIMA
- mean(e1^2, na.rm = TRUE) #Find MSE of each model class
- #TS%>% farima(h=#forecast)%>% autoplot() #Plot forecasts using the best model class
Dynamic Regression
- autoplot(#TS[, c(#col1, #col2)], facets = TRUE) #Time plot of both variables
- xreg <- cbind(#colname = #TS[, "#col"]) # Matrix of regressors
- fit <- auto.arima(#TS[, "#col"], xreg = xreg, stationary = TRUE) #Fit ARIMA model with stationary data
- coef <- coefficients(fit)['xreg'] #Check model for coefficient of regression
- fc <- forecast(fit, xreg = cbind(#xreg1, #xreg2) OR rep(#value, #rep)) #Forecast fit as fc
- autoplot(fc) + xlab("#X") + ylab("#Y") #Plot fc with x and y labels
Dynamic Harmonic Regression
- harmonics <- fourier(#TS, K = #) #Set up harmonic regressors of order #
- fit <- auto.arima(#TS, xreg = harmonics, seasonal = FALSE) #Fit regression model with ARIMA errors
- newharmonics <- fourier(#TS, K = #, h = #forecast length) #Forecasts next h years
- fc <- forecast(fit, xreg = newharmonics)
- fc %>% autoplot() #Plot forecasts fc
Multiple seasonality
- fit <- tslm(#TS ~ fourier(#TS, K = c(#, #))) #Fit a harmonic regression using order # for each type of seasonality
- fc <- forecast(fit, newdata = data.frame(fourier(#TS, K = c(#, #), h = #h))) #Forecast #h periods ahead (e.g. 20 working days)
- autoplot(fc) #Plot the forecasts
- checkresiduals(fit) #Check the residuals of fit #Can use forecast even if the test fails
Daily booking
- xreg <- fourier(#TS, K = c(#daily period, #weekly period order)) #Set up the xreg matrix
- fit <- auto.arima(#TS, xreg = xreg, seasonal = FALSE, stationary = TRUE) #Fit a dynamic regression model
- checkresiduals(fit) #Check the residuals
- fc <- forecast(fit, xreg = fourier(#TS, K = c(#daily, #weekly), h = #h)) #Forecast #h working days ahead
- autoplot(fc) #Plot the forecasts. Note: It is often unrealistic to have residuals that pass the tests for such long series. The effect of the remaining correlations on the forecasts will be negligible.
TBATS
- Trigonometric terms for seasonality
- Box-Cox transformations for heterogeneity
- ARMA errors for short-term dynamics
- Trend (possibly damped)
- Seasonal (including multiple and non-integer periods)

autoplot(#TS) #Plot the data
fit <- tbats(#TS) #Fit a TBATS model to the data
fc <- forecast(fit, h = #h) #Forecast the series for the next 5 years
autoplot(fc) #Plot the forecasts

Seasonal ARIMA
- acf2(#TS, max.lag = #lag) #Plot sample P/ACF to lag # and compare to the true values
- sarima(#TS, p = #, d = #, q = #, P = #, D = #, Q = #, S = #) #Fit the seasonal model

diff(diff(#TS), lag = 12) #first diff for non-seasonal, second diff for seasonal lag of 12
sarima(#TS, p=2, d=1, q=0, P=0, D=1, Q=1, S=12) #first diff for non-seasonal, second diff for seasonal lag of 12

Box.test(#TS, lag = #lag, fitdf = 0, type = "Ljung") #Ljung-Box test: p > 0.05 suggests white noise (no significant autocorrelation)
Cross-validation
- e <- tsCV(#TS, forecastfunction = #model, h = 6) # Compute cross-validated errors for up to 6 steps ahead
- mse <- colMeans(e^2, na.rm = TRUE) # Compute the MSE values and remove missing values
- data.frame(h = 1:6, MSE = mse) %>%
- ggplot(aes(x = h, y = MSE)) + geom_point() # Plot the MSE values against the forecast horizon

Maps

gvisGeoMap(#data, locationvar = "#colname", numvar = "#colname", hovertext = "", options = list(width = "", height = "", dataMode = "#e.g. regions to plot region and not whole world", region = '#regional code e.g. US', colors = "#hex colour")) #choropleth map

contour(#data, main = "#e.g. Title", col = "#e.g. blue") #isopleth / contour map
filled.contour(#data, color.palette = terrain.colors or topo.colors or heat.colors, main = "Title") #isopleth / contour map filled with colours

install.packages("ggmap", dependencies = TRUE)
library(ggmap)
#map <- get_map("e.g. newyorkcity", zoom = #e.g. 12)
ggmap(#map, extent = "device", legend = "#e.g. topleft") +
#map +
stat_density2d(aes(x = LONGITUDE, y = LATITUDE, fill = ..level.., alpha = ..level..), size = #e.g. 2, bins = #e.g. 4, data = collide, geom = "polygon")

gvisGeoChart(#data, locationvar = "Country", sizevar = "#col", options = list(displayMode="markers", width = #e.g. 400, height = #e.g. 200, markerOpacity = #e.g. 0.5, sizeAxis = "{maxSize: 'e.g. 35'}", colorAxis="{colors:['green', 'red']}")) #Bubble map

install.packages("maps")
library(maps)
map("#e.g. state")
for(i in #e.g. 1:50) { text(data$long[i], data$lat[i], data$col[i], adj = #e.g. 0.5) }

QGIS #Open source GIS programme

Operating System

print("#package") #list dir
getwd() #get working dir (takes no arguments)
setwd("#dir") #set working dir
list.files("#dir") #list files in dir
file.info("#file name") #list info of file
file.rename("#original file name", "#new file name") #rename file
file.remove("#file name") #delete file
file.copy("#original file name", "#new file name") #make a copy of file
file.path("#file name") #display file path
dir.create(file.path("#parent folder", "#sub-folder"), recursive=TRUE) #create sub-folder via recursive

Impute missing data

mice

Data validation

pROC

Data cleaning

e1071
tidyverse

Data manipulation

reshape / reshape2

String manipulation

stringr - string manipulation
rebus - string manipulation

Date & Time

lubridate

Objective / Statistical binning

woeBinning

Data manipulation

dplyr

Data modelling

mars
caret - data regression

Data visualisation

ggplot2 - graph and chart
PCAmixdata
zoo - time series modelling

Random forest

rpart
randomForest

Create applications

shiny

Google Sites

Report abuse