setwd("C:\\") #set working directory (Note: Need to use \\)
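test=readr::read_csv("file.csv", col_names = TRUE, col_types="#c for character, d for double, - for not to read col", n_max="#number of rows to read") #read files with readr: read_csv (comma) / read_csv2 (semicolon) / read_tsv (tab). Note: base read.csv("file.csv", sep=",", header = TRUE) does not accept col_types or n_max (use colClasses and nrows instead)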
x=read.table("#dir", sep=",", header = TRUE) #read table with sep="," or "\t" or "@" and import data
Exporting data
write.table(x,"file.csv",sep=",") #export data with .csv or .xls or .txt
Data Types
Things to look out
Format and dimension of the data
Variable names and how they are stored
Missing data? Flawed data?
Data types
x=1 is scalar, y=c(1,2,3) is vector, z = matrix(c(), nrow, ncol)
z[] --> Select all
z[x,y] --> Select specific row x and col y
z[c(x1,x2), c(y1,y2)] --> Output of 4 elements (x1y1, x1y2, x2y1, x2y2)
x =c(#e.g. 1:5) #numeric vector of 1 to 5
x = list (c(#e.g. 1:5),"e.g. Hello World") #list
#as.numeric(x) #as.logical(x) #as.character(x) #as.complex(x) #change type
object.size(#matrix) #file size
#df = data.frame(#data, stringsAsFactors=FALSE) #prevent R from converting character (string) columns to factors
List all variables
ls()
Assigning values
x[e.g. 1]=3
x[e.g. 1,2]=3 #assign 3 to row 1 col 2
Referencing
with(#matrix, function) #No need to repeat name of table
attach(#df) #reference to data frame so that no need to repeat
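| #element-wise OR: compares ALL elements pairwise and returns a logical vector (TRUE where at least 1 of the pair is true)
|| #scalar OR for single TRUE/FALSE conditions (e.g. inside if()); short-circuits. Note: older R versions used only the first element; R >= 4.3 gives an error for inputs longer than 1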
xor() #exclusive OR, i.e. xor(FALSE, TRUE)=TRUE, xor(TRUE, TRUE)=FALSE
isTRUE(x > y)
which(x) #list element number of TRUE
any(x) #check if any element meets the criterion
all(x) #check if all elements meet the criterion
Test for finite / infinite data
is.finite(x)
is.infinite(x)
Error
NA: Not available (i.e. missing data)
NaN: Not a number error (e.g. 0/0)
Inf
Test for error cases
is.na(x) #is.na(x) is not the same as x==NA
is.nan(x)
anyNA(x) #test if there is any NA in array
complete.cases(x) #TRUE for each row (or element) that has no missing values, i.e. the opposite of "contains NA"
#mean(#df, na.rm = TRUE) #calculate with NA removed (values still stay)
na.omit(#df) #remove rows with NA
clean<-complete.cases(x); x[clean] #remove NA
Mathematical Operations
Mathematical Operations
%% #modulus (e.g. 5 %% 2 = 1)
sqrt()
abs()
round()
factorial()
sample(#range of values e.g. c(0,1), #number of toss, replace = TRUE, #skewed probability e.g. c(0.3, 0.7)) #random numbers with replacement after each toss
unique(x) #unique values
User-defined operations
"%#var_name%" <- function()
Matrix
Creation of matrix
x=matrix(c(e.g. 1:10),nrow= e.g. 2,ncol= e.g. 5) #create matrix
x=data.frame(x=c(e.g. 1:4),y=c("e.g. Hello","e.g. World")) #create data frame (table whose columns can have different data types)
x=matrix(rep(#value,#number_of_repetition),ncol=e.g. 2,nrow=e.g. 3) #create repeated values in matrix
replicate(#number of replicates, function) #repeat function
Apply
apply(#matrix, #1 for row or 2 for col, #function) #function loop over rows (1) or columns (2)
lapply(#matrix, #function) #function loop for list
lapply(#matrix, #function(x) x[n]) #function loop for list - return element n
sapply(#matrix, #function) #function loop for list in a simplified one-liner
vapply(#matrix, #function, #datatype e.g. numeric(1)) #function loop for list in a simplified one-liner (safer than sapply by specifying the type of output expected)
as.character(class(#matrix)) #return type of var in matrix in a list
tapply(#datatable$colname, #datatable$groupbycolname, #function) #function loop for vector and group by category
Nested function
return (e.g. fx)
Probability functions
r*** for "random"
rbinom (#number of variable 1, #number of flips size = e.g. 100,prob = e.g. 0.7) #binomial distribution
rnorm(#number of variable e.g. 10,mean = e.g. 100,sd = e.g. 25) #normal distribution
rpois(#number of variable e.g. 10, #mean) #Poisson distribution
rexp() #exponential
rchisq() #chi-squared
rgamma() #gamma
d*** for "density"
p*** for "probability"
q*** for "quantile"
Text operations
Concatenation
paste(x, collapse = "#character to add in between text of entire result") #concatenate
paste("#value1", "#value2", sep = "#character to add in between text of each term") #concatenate
paste("#value1", "#value2", sep = "#character to add in between text") #concatenate first element of vector 1 with that of vector 2 before next element
paste0("#value1", "#value2") #concatenate without separator
paste("#statement", #vector) #print each statement with an element
Formatting and counting
names(x) #display names of elements / columns
nchar(x) #count number of characters
toupper(x) #UPPERCASE
tolower(x) #lowercase
str_to_title(x) #Title Case (stringr); use str_to_sentence(x) for sentence case
str_trim("#string") #trim whitespace from the ends of the string (Note: Not inside)
LETTERS #all alphabetical characters in vector
Search
grepl(#search value, #string) #search if string contains search value
grepl("#start . #end", #string) #search if string contains search value beginning with #start and ending with #end
grepl("(#search value) {#min adjacent repeat occurrence, #max adjacent repeat occurrence}", #string) #search if string contains search value (Note: search value can be (x.{#occurrence}) {#occurrence}
word("#string", #position of word to extract) #extract word from sentence
Metacharacter
. Any Character
\\w #A Word (letter, digit, underscore)
\\W #Not a Word
\\d #A Digit (0 to 9)
\\D #Not a Digit
\\s #Whitespace (line breaks, tabs, spaces)
\\S #Not Whitespace
\\#special characters #Special characters
[xyz] #A Set of Characters
[^xyz] #Negation of Set (i.e. all except xyz)
[a-z] #A Range of Characters from a to z
^ #Beginning of String
$ #End of String
\n #Newline
+ #One or More of Previous
* #Zero or More of Previous
? #Zero or One of Previous
| #Either the Previous or the Following
{5} #Exactly 5 of Previous
{2,5} #Between 2 and 5 of Previous (Note: no space inside the braces)
{2,} #2 or More of Previous
x|y #search for x or y
E.g.
#matrix( grepl(#start_end_with_vowel "^[AEIOU]{1}.+[aeiou]{1}$", #matrix) ) #display list of words that starts with capitalized vowels and end with lowercase vowels
Substitution
sub("[#search value]", "#new value", #vector) #substitute first instance
gsub("[#search value]", "#new value", #vector) #substitute ALL instances
str_pad("#text",width = #total length of string e.g. 8, side = "#adding char from e.g. left", pad = "e.g. -") #add char - from the left so that total length is e.g. 8
Splitting and extracting strings
strsplit(#vector, "#search value")
str_extract("#vector", "#search value e.g. [0-9]+")
Arranging strings in order
str_order(#vector)
Unpacking arguments
args <- list(...)
[Cont'd] x<- args[["x"]]
Date and Time
Time is wrt 1970-01-01
Sys.Date() #YYYY-MM-DD (Date)
Sys.time() #YYYY-MM-DD HH:MM:SS +UTC (POSIXct or POSIXlt)
x$min #display min
strptime(x, "%B %d, %Y %H:%M") #convert time
difftime(Sys.time(), x, units = #type of time passed 'days') #difference between time
install.packages("portfolio") #tree map in map.market()
install.packages("googleVis") #interactive plot
gvisScatterChart
gvisBarChart
gvisLineChart
gvisTimeline
gvisHistogram
gvisBubbleChart
gvisTreeMap
gvisGeoMap
gvisMotionChart
brewer.pal(#number of shading, "#e.g. Greens") #colour palette (RColorBrewer)
Formatting
#data = #data[order(#data$col, decreasing = TRUE), ] #order data (Note: the comma after order() is needed to reorder the rows of a data frame)
par(bg = "e.g. white", las = #axis label_type e.g. 1, col.lab = "#col label colour e.g. black", col.axis = "#col axis colour e.g. white", bty = "e.g. n", cex.axis = #axis size e.g. 0.9, cex.lab = #label size e.g. 1.5, mar = c(#margin for bottom, #left, #top, #right))
par(mfrow = c(#row,#col), mar = c(#bottom, #left, #top, #right)) #divides plot area into rows and columns with margins
gvisMerge(#graph1, #graph2, horizontal = TRUE) # Display graphs side by side
grDevices::windowsFonts(#font = windowsFont("#font name")) and par(family="#font") #Change font (Windows only)
xkcd #xkcd theme
Scatterplot
plot(x=#data$colname for x axis, y=#data$colname for y axis, col = rainbow(#colours) OR col = c(#colour based on group), pch = #?points type of point coloured circle 20, xlim=c(0,max(#data$colname for max x-value)), type = "o" (to connect points), main = "#title", sub= "#subtitle under x-axis", xlab = "#x-axis label", ylab ="#y-axis label")
legend("topright",cex = #size e.g. 0.6, fill = c("#colour e.g. red for var1","#colour e.g. black for var 2"), legend = c("#var1" ,"#var2"))
abline(h=#value or v=#value, lwd = #line width e.g. 1, lty = #type of line e.g. 4 for dotted, col = "e.g. red") # create horizontal or vertical line
text(#data$colname, pos = e.g. 2, offset = e.g. 0) #add text to graph
title(main = "#title") #add title to graph
scatter.smooth(#data$colname for x axis, #data$colname for y axis, pch = type of point e.g. coloured circle 20, lpars = list(lty = e.g. 3, col ="e.g. black", lwd = e.g. 2)) #smooth Locally Weighted Scatterplot Smoothing line
Histogram
hist(#matrix$col) #histogram
rug(#matrix$col) #histogram with density of tick marks
Barchart
barplot(data, main="#title", xlab="#x-axis label", col=c("#barcolour e.g. darkblue","#barcolour e.g. red", "#barcolour e.g. yellow"), beside=#stacked FALSE or side-by-side TRUE, legend = rownames(data))
Boxplot
boxplot(formula = #y ~ #x, data = #data, col = "colour e.g. red")
Dendrogram
#d = dist(#dataframe, method = "euclidean")
hclust(#d, "ave") #hierarchical clustering for no prior knowledge of underlying cluster
data$col1 [data$col2 == ""] = NA #Assign NA for row without relevant data
Pie Chart
pie(round(#data / sum(#data) * 100), col = e.g. #colour, radius = #e.g. 1, init.angle = #e.g. 90, clockwise = TRUE, labels = paste(c(#labels), "#e.g. %"), main = "#e.g. Title")
install.packages("plotrix")
library(plotrix)
floating.pie(#x coordinate for centre of pie e.g.3, #y coordinate for centre of pie e.g. 3, #e.g. pie data, radius = #e.g. 1, col = #colour, startpos = #starting position in radians e.g. 4)
pie.labels(#x coordinate for centre of pie e.g.3, #y coordinate for centre of pie e.g. 3, #angles e.g. pie chart, radius = #less than 1 to be plotted inside e.g. 0.4, labels = #e.g. labels)
legend("#e.g. left", fill = "#e.g. colour", legend = "#e.g. legend")
draw.circle(#x coordinate for centre of pie e.g.3, #y coordinate for centre of pie e.g. 3, radius = #e.g. 0.5, col = "#e.g. white") #additional code for donut chart
Fan Chart
library(plotrix)
fan.plot(round(#data / sum(#data) * 100), col = e.g. #colour, max.span = pi, align = "#e.g. left", labels = paste(c(#labels), "#e.g. %"), main = "#e.g. Title")
Slope Chart
library(plotrix)
bumpchart(#data in e.g. columns 3 and 4 [, 3:4], lwd = "#line width e.g. 2", col = ("#hex"), top.labels = NA, rank = FALSE)
boxed.labels (#x coordinate e.g.1, #y coordinate e.g. 11, labels = c(#col), col = "e.g. blue", border = FALSE)
boxed.labels (#x coordinate e.g.2, #y coordinate e.g. 11, labels = c(#col), col = "e.g. red", border = FALSE)
3D Plots
plot3D::scatter3D() #3D scatterplot
rgl::plot3d() #interactive 3D scatterplot
plotrix::pie3D() #3D piechart
plot3D::hist3D() #3D histogram
plot3D::ribbon3D() #3D ribbon plot
plot3D::contour3D() #3D contour plot
plot3D::image2D() and persp3D() #3D contour and flat surface plot
plot3D::surf3D() #3D surface plot
animation::saveGIF() #animation of plot
Multidimensional Plots
HistData and graphics::sunflowerplot() #Plot density of data as sunflower diagram in scatterplot
hexbin::hexbin() #Plot density of data as hexagon in scatterplot
googleVis::gvisCalendar() #Plot density of data in calendar
plotrix::pyramid.plot() #Horizontal bar plots (e.g. gender differences)
stats::cor() and corrplot::corrplot(method = c("#e.g. ellipse / number / square"), order = "#e.g. hclust", addrect = #number of clusters e.g. 4, rect.col = "#e.g. blue") #Correlation Plot with Clusters
ISLR and stats::lm() #Regression Line
ISLR and graphics::boxplot() #Box and Whiskers Plot
quantmod and stats::qqnorm(), qqline() with shapiro.test() #Quantile-Quantile Plot (e.g. deviation from normal distribution)
ISLR and vioplot::vioplot() #Violin Plot
ts() and decompose() #Decomposed Time Series
acf() #Correlogram (Auto Correlation Function ACF Plot or Autocorrelation plot)
forecast and fanplot::auto.arima(), simulate(), and fan() #Forecast using Autoregressive Integrated Moving Average Model (ARIMA)
quantmod::chartSeries() #Candlestick Plot (e.g. stock prices for opening, closing, high and low)
quantmod and graphics::lines() with hist() #Density Plot in Histogram
Word Cloud and Text Mining
wordcloud::wordcloud(#text, min.freq=#plot all words using e.g. 1, scale = c(#e.g. 2, 0.5), random.color = #e.g. TRUE, colors = #e.g. pal) #Word Cloud
tm::Corpus(DirSource("#dir")) #Open text files in directory
tm::tm_map() #Text mining tool to remove numbers, punctuations etc
tm::findFreqTerms(#data, #frequency e.g. 14) #Text mining tool to list words that appear n times
tm::findAssocs(#data, c(#word1, #word2), #correlation e.g. 0.5) #Text mining tool to list words that are related to terms with lower correlation limits
XML::xmlParse(#data), xpathSApply(#data, "//#value", xmlValue)#Parse XML data
Time Series
autoplot(#ts, facet = TRUE) #Plot time series
Trends induce positive correlations in the early lags.
Seasonality will induce peaks at the seasonal lags.
Cyclicity induces peaks at the average cycle length.
Types of forecasting
naive(#TS, #forecast length) #use most recent observation
snaive(#TS, #forecast length) #seasonal
Simple exponential smoothing
fc <- ses(#TS, h = 10) #Use ses() to forecast the next 10 years
summary(fc) #Use summary() to see the model parameters
autoplot(fc) + autolayer(fitted(fc)) # Add the one-step forecasts for the training data to the plot
Holt's trend method
fcholt <- holt(#TS, damped=FALSE, h = 10) #Produce 10 year forecasts using holt()
summary(fcholt) #Look at fitted model using summary()
autoplot(fcholt) #Plot the forecasts
checkresiduals(fcholt) #Check that the residuals look like white noise
Holt-Winters' trend and seasonality method
fcholt <- holt(#TS, damped=FALSE, h = 10) #Produce 10 year forecasts using holt()
summary(fcholt) #Look at fitted model using summary()
autoplot(fcholt) #Plot the forecasts
checkresiduals(fcholt) #Check that the residuals look like white noise
fc <- hw(#TS, seasonal = "multiplicative", h = 3) #Produce 3 year forecasts
checkresiduals(fc) #Check if residuals look like white noise
autoplot(fc) #Plot the forecasts
Box-Cox transformation
BoxCox.lambda(#TS) #find out lambda value for transformation
auto.arima()
# Forecast function for ARIMA models: fits auto.arima() to the series `x`
# and returns the forecast of that fit `h` steps ahead.
# It is handed to tsCV() as the forecast function in the next step.
farima <- function(x, h) {
  arima_fit <- auto.arima(x)
  forecast(arima_fit, h = h)
}
e1 <- tsCV(#TS, farima, h=1) #Compute CV errors for ARIMA
mean(e1^2, na.rm = TRUE) #Find MSE of each model class
#TS%>% farima(h=#forecast)%>% autoplot() #Plot forecasts using the best model class
Dynamic Regression
autoplot(#TS[, c(#col1, #col2)], facets = TRUE) #Time plot of both variables
xreg <- cbind(#colname = #TS[, "#col"]) # Matrix of regressors
fit <- auto.arima(#TS[, "#col"], xreg = xreg, stationary = TRUE) #Fit ARIMA model with stationary data
coef <- coefficients(fit)['xreg'] #Check model for coefficient of regression
fc <- forecast(fit, xreg = cbind(#xreg1, #xreg2) OR rep(#value, #rep)) #Forecast fit as fc
autoplot(fc) + xlab("#X") + ylab("#Y") #Plot fc with x and y labels
Dynamic Harmonic Regression
harmonics <- fourier(#TS, K = #) #Set up harmonic regressors of order #
fit <- auto.arima(#TS, xreg = harmonics, seasonal = FALSE) #Fit regression model with ARIMA errors
newharmonics <- fourier(#TS, K = #, h = #forecast length) #Forecasts next h years
fc <- forecast(fit, xreg = newharmonics)
fc %>% autoplot() #Plot forecasts fc
Multiple seasonality
fit <- tslm(#TS ~ fourier(#TS, K = c(#, #))) #Fit a harmonic regression using order # for each type of seasonality
fc <- forecast(fit, newdata = data.frame(fourier(#TS, K = c(#, #), h = #h))) #Forecast #h steps ahead (e.g. 20 working days)
autoplot(fc) #Plot the forecasts
checkresiduals(fit) #Check the residuals of fit #Can use forecast even if the test fails
Daily booking
xreg <- fourier(#TS, K = c(#daily period, #weekly period order)) #Set up the xreg matrix
fit <- auto.arima(#TS, xreg = xreg, seasonal = FALSE, stationary = TRUE) #Fit a dynamic regression model
checkresiduals(fit) #Check the residuals
fc <- forecast(fit, xreg = fourier(#TS, K = c(#daily, #weekly), h = #h)) #Forecast #h working days ahead
autoplot(fc) #Plot the forecasts. Note: It is often unrealistic to have residuals that pass the tests for such long series. The effect of the remaining correlations on the forecasts will be negligible.
TBATS
Trigonometric terms for seasonality
Box-Cox transformations for heterogeneity
ARMA errors for short-term dynamics
Trend (possibly damped)
Seasonal (including multiple and non-integer periods)
autoplot(#TS) #Plot the data
fit <- tbats(#TS) #Fit a TBATS model to the data
fc <- forecast(fit, h = #h) #Forecast the series for the next 5 years
autoplot(fc) #Plot the forecasts
Seasonal ARIMA
acf2(#TS, max.lag = #lag) #Plot sample P/ACF to lag # and compare to the true values
sarima(#TS, p = #, d = #, q = #, P = #, D = #, Q = #, S = #) #Fit the seasonal model
diff(diff(#TS), lag = 12) #first diff for non-seasonal, second diff for seasonal lag of 12
sarima(#TS, p=2, d=1, q=0, P=0, D=1, Q=1, S=12) #first diff for non-seasonal, second diff for seasonal lag of 12
Box.test(#TS, lag = #lag, fitdf = 0, type = "Ljung") #Ljung-Box test: p > 0.05 suggests white noise (no significant autocorrelation)
Cross-validation
e <- tsCV(#TS, forecastfunction = #model, h = 6) # Compute cross-validated errors for up to 6 steps ahead
mse <- colMeans(e^2, na.rm = TRUE) # Compute the MSE values and remove missing values
data.frame(h = 1:6, MSE = mse) %>%
ggplot(aes(x = h, y = MSE)) + geom_point() # Plot the MSE values against the forecast horizon
Maps
gvisGeoMap(#data, locationvar = "#colname", numvar = "#colname", hovertext = "", options = list(width = "", height = "", dataMode = "#e.g. regions to plot region and not whole world", region = '#regional code e.g. US', colors = "#hex colour")) #choropleth map
contour(#data, main = "#e.g. Title", col = "#e.g. blue") #isopleth / contour map
filled.contour(#data,color.palette = terrain.colors or topo.colors or heat.colors, main = "Title") #isopleth / contour map filled with colours