Introduction to R

Course Outline

1) History

R is a programming language for statistical computing and graphics, supported by the R Core Team and the R Foundation for Statistical Computing. Created by statisticians Professor Ross Ihaka and Robert Gentleman (1993), R is widely used by data miners, bioinformaticians, and statisticians for data analysis and for developing statistical software. A large number of extension packages, which contain reusable code and documentation, complement the core R language.

RStudio is an integrated development environment (IDE) for R. It is available in two formats: RStudio Desktop is a regular desktop application, while RStudio Server runs on a remote server and allows RStudio to be accessed through a web browser. The RStudio IDE is a product of Posit PBC (formerly RStudio PBC, formerly RStudio Inc.).

2) Installing R

Step 1: Go to The Comprehensive R Archive Network (CRAN link: https://cran.r-project.org/)

Step 2: Click “Download R for Windows” > “base” > “Download R for Windows” (as of 20 Nov 2023, the version is 4.3.1)

Step 3: Save As and start the installation process

3) Installing RStudio

Step 1: Visit the RStudio website (Link: https://posit.co/download/rstudio-desktop/)

Step 2: Click "Download RStudio Desktop for Windows"

Step 3: Save As and Run the application.

4) RStudio User Interface

5) Install R Package

Install multiple packages

# Install every package used in this course with one call: install.packages()
# accepts a character vector of package names.
install.packages(c(
  "fpp",        # Data for "Forecasting: principles and practice"
  "forecast",   # Forecasting Functions for Time Series and Linear Models
  "ggplot2",    # Create Elegant Data Visualisations
  "tidyverse",  # Collection of R packages designed for data science
  "caret",      # Classification and Regression Training
  "mlbench",    # Machine Learning Benchmark Problems
  "MASS"        # Support Functions and Datasets for Venables and Ripley's MASS
))

6) Load R Package

# Attach the course packages so that their functions and datasets can be used
# without the pkg:: prefix. library() stops with an error if a package is not
# installed.
library(fpp)
library(forecast)
library(ggplot2)

# MASS is attached BEFORE tidyverse on purpose. Both MASS and dplyr (attached
# by tidyverse) export a function named select(), and the package attached
# last masks the earlier one. With this order a plain select() call refers to
# dplyr::select(); MASS::select() is still reachable with the MASS:: prefix.
library(MASS)
library(tidyverse)

library(caret)
library(mlbench)

7) Create Variable and Data

# Example data with five observations. Each vector holds one variable; the
# vectors all have the same length so they can later be combined column-wise.

# CGPA (quantitative)
cgpa <- c(
  3.65, 2.56, 3.00, 2.80, 3.80
)

# Study hours -- "jam belajar" means "study hours" (quantitative)
jamBelajar <- c(
  4, 8, 1, 2, 5
)

# Gender -- "jantina" means "gender", "lelaki" = male, "perempuan" = female
# (qualitative, stored here as plain text)
jantina <- c(
  "lelaki", "lelaki", "perempuan", "perempuan", "lelaki"
)

8) Qualitative Variable

# Example of a qualitative variable. It is created as a plain character
# vector, which R does not store as a factor.
jantina <- c(
  "lelaki", "lelaki", "perempuan", "perempuan", "lelaki"
)

# is.factor() tells whether a variable is stored as a factor, i.e. as a
# qualitative (categorical) variable.
is.factor(jantina)

Result: [1] FALSE

# as.factor() converts a variable into a factor, the type R uses for
# qualitative (categorical) variables. The result overwrites jantina.
jantina <- as.factor(jantina)

# Check again whether jantina is stored as a factor after the conversion.
is.factor(jantina)

Result: [1] TRUE

9) Create Datasets

# Combine the three vectors into a single data frame: one column per variable
# and one row per observation. The columns are named explicitly; these are the
# same names data.frame() would derive from the variable names.
dataset1 <- data.frame(
  cgpa = cgpa,
  jamBelajar = jamBelajar,
  jantina = jantina
)

10) Descriptive Analysis

# Arithmetic mean of each quantitative variable
mean(cgpa)

mean(jamBelajar)

# Median (middle value) of each quantitative variable
median(cgpa)

median(jamBelajar)

table(jantina)                # table() is used for qualitative variables: it builds a contingency table of the counts per category


summary(dataset1)      # summary() of a data frame summarises every column; for numeric columns it reports minimum, Q1, median (Q2), mean, Q3 and maximum

Descriptive Analysis (Advanced)

There are several other R packages that can be used to produce statistical outputs such as:


# Install the pastecs package (only needed once per R installation).
install.packages("pastecs")

# Attach pastecs so that stat.desc() can be called directly.
library(pastecs)

# stat.desc() produces a table of descriptive statistics for dataset1.
# norm = TRUE additionally requests normal-distribution statistics
# (see the pastecs documentation for the exact list).
stat.desc(dataset1, norm = TRUE)


source: https://cran.r-project.org/web/packages/pastecs/index.html


11) Graphic (Barplot)

# Bar plot of the number of observations in each gender category.
# barplot() expects the bar heights, so the raw values are counted with
# table() first. The categories ("Jantina" = gender) run along the x-axis and
# the counts ("Bilangan" = count) along the y-axis, so the axis labels are
# assigned accordingly.
barplot(table(jantina),                   # table() supplies the bar heights (counts)
        legend.text = TRUE,               # legend.text = TRUE adds a legend to the plot
        col = c("turquoise", "pink"),     # col = colours assigned to the bars
        main = "Barplot for Jantina",     # main = title of the bar plot
        xlab = "Jantina",                 # xlab = label of the x-axis (categories)
        ylab = "Bilangan")                # ylab = label of the y-axis (counts)

Graphic (Histogram)

# Histogram of study hours ("jam belajar"). The x-axis shows the hours
# ("Jam") and the y-axis is labelled as the count ("Bilangan").
hist(
  jamBelajar,
  main = "Histogram of Jam Belajar",  # plot title
  xlab = "Jam",                       # x-axis label
  ylab = "Bilangan",                  # y-axis label
  col = "grey"                        # fill colour of the bars
)

Graphic (Boxplot)

Example 1

# Box plot of a single quantitative variable (CGPA).
boxplot(
  cgpa,
  xlab = "CGPA",               # x-axis label
  main = "Box Plot for CGPA"   # plot title
)


Example 2 (uses dataset1 created above; the Real Estate Sales dataset required from Example 3 onwards is available at: https://app.box.com/s/71zssj13ry01ypyfiscm)

# Side-by-side box plots of CGPA, one box per gender category. The formula
# `cgpa ~ jantina` reads "cgpa grouped by jantina"; both columns are looked up
# in dataset1 through the data argument.
boxplot(
  cgpa ~ jantina,
  data = dataset1,
  main = "Boxplot CGPA vs Gender",  # plot title
  xlab = "Gender",                  # x-axis label
  ylab = "CGPA"                     # y-axis label
)


Example 3

# Box plots of sales price grouped by house quality.
# `Data` holds the Real Estate Sales dataset; it must already exist in the
# workspace, since this section does not show how it is read in.
boxplot(
  Data$Sales ~ Data$Quality,
  xlab = "Quality",                               # x-axis label
  ylab = "Sales Price",                           # y-axis label
  main = "Boxplot Sales Price vs House Quality"   # plot title
)

Graphic (Scatterplot)

Example 1

# Scatter plot of sales price against square feet, with the independent
# variable on the x-axis and the dependent variable on the y-axis.
plot(
  x = Data$Sqft,                                    # independent variable
  y = Data$Sales,                                   # dependent variable
  xlab = "Square Feet",                             # x-axis label
  ylab = "Sales Price",                             # y-axis label
  main = "Scatter Plot Sales Price vs Square Feet"  # plot title
)

Graphic (Q-Q plot)

Example 1

# Normal Q-Q plot of the sales price.
# qqnorm() plots the sample quantiles of the data against the theoretical
# quantiles of a normal distribution; qqline() adds a reference line. Points
# lying close to the line suggest the data are approximately normal.
# (The previous code under this heading repeated the scatter plot and did not
# draw a Q-Q plot.)
qqnorm(Data$Sales,                                  # variable whose distribution is checked
       main = "Normal Q-Q Plot of Sales Price",     # main = plot title
       xlab = "Theoretical Quantiles",              # xlab = label of the x-axis
       ylab = "Sample Quantiles (Sales Price)")     # ylab = label of the y-axis

qqline(Data$Sales)                                  # reference line added to the Q-Q plot

12) Cross Tabulation

Example 1

# Two-way table of counts: the categories of the first argument (Quality)
# form the rows and those of the second argument (Aircond) form the columns.
table(Data$Quality, Data$Aircond)


Example 2 (uses the "summarytools" package)

# Package "summarytools": Tools to Quickly and Neatly Summarize Data
install.packages("summarytools")
library(summarytools)

# ctable() produces the cross tabulation of two qualitative variables.
ctable(
  x = Data$Quality,
  y = Data$Aircond
)


source: https://cran.r-project.org/web/packages/summarytools/index.html

13) Statistical test (Chi-square test of independence)

Example 1

# Cross tabulation of two qualitative variables together with the result of
# the Chi-square test of independence.
ctable(
  x = Data$Quality,   # qualitative variable 1
  y = Data$Aircond,   # qualitative variable 2
  headings = FALSE,   # remove headings
  chisq = TRUE        # display results of Chi-square test of independence
)

Statistical test (Correlation Analysis - Two Quantitative variables)

Example 1

# cor() produces the correlation coefficient, r, between two quantitative
# variables.
cor(
  x = Data$Sales,
  y = Data$Sqft
)