DST SERB High-End Workshop on application of statistics and machine learning in environmental research

(4th March-10th March, 2024)

The workshop was organized by the Indian Statistical Institute, Tezpur campus. Although the course ran for a week, Prof. Sabyasachi Bhattacharya and I were engaged for only two days of lectures. The main aim of the workshop was to refresh and update participants' knowledge of statistics in connection with ideas from Machine Learning. Nowadays, the analysis of any data set requires good prior knowledge of some statistical software, and the most popular one in this domain is R. The main reason behind its popularity is that it is free of cost and can easily be downloaded from the provided link (R software). Another reason is its versatility: the software is useful in virtually every domain.

Introduction to the R software:

R was begun in the 1990s by Robert Gentleman and Ross Ihaka of the Statistics Department of the University of Auckland. Nearly 20 senior statisticians form the core development group of the R language, including John Chambers of Bell Labs, the primary developer of the original S language. R contains a large, coherent, integrated collection of intermediate tools for data analysis. The software is free and can be downloaded from http://www.r-project.org/. Its versatility means it can be used in every domain, and its coding structure is simpler than that of many comparable packages. Moreover, R is not used only for coding; you can also prepare manuscripts and all sorts of presentations in it, with R Markdown helping in this case. A major strength of the software is its packages. Packages are collections of R functions, data, and compiled code in a well-defined format, and the directory where packages are stored is called the library. Currently, the CRAN package repository features 10964 available packages (https://cran.r-project.org/web/packages/). To install packages, type install.packages(pkgs, dependencies=TRUE) in the console.
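For example, to install and load a CRAN package (ggplot2 is used here purely as an illustration; any package name works in its place):

install.packages("ggplot2", dependencies = TRUE)   # one-time download from CRAN
library(ggplot2)                                   # load the package in each new session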

I have used this software in most of my research work. For any sort of query, do not hesitate to contact me; the contact details are provided on my home page. Don't worry, this page will be updated as per your needs.


8th March 2024:

1. Basic Understanding of the sampling distribution with the help of R software:

set.seed(897)

############# Data
Marks = rnorm(1000, 65, 10)   # marks of 1000 students: population mean 65, sd 10
Marks = round(Marks, 2)
Marks

####### graphical representation
plot(Marks, xlab = "students no.", ylab = "Marks")

###### frequency distribution
par(mfrow = c(2, 2))
hist(Marks)
#v = seq(min(Marks), max(Marks), length.out = 25)
#hist(Marks, breaks = v, prob = T)
#lines(density(Marks, kernel = "gaussian"))


########## sample of size 100
sample100 = sample(Marks, 100)
sample100

###### sample frequency distribution
hist(sample100, col = "red")

########## sample of size 500
sample500 = sample(Marks, 500)
sample500

###### sample frequency distribution
hist(sample500, col = "blue")

## Mean analysis
mean(Marks)       # population mean
mean(sample100)   # sample mean, n = 100
mean(sample500)   # sample mean, n = 500


########## 100 samples, each of size 100
samplecomb = matrix(NA, 100, 100)
for (i in 1:100)
{
  samplecomb[i, ] = sample(Marks, 100)
}
dim(samplecomb)
samplecombrwmean = rowMeans(samplecomb)   # mean of each repeated sample

hist(samplecombrwmean, col = "green", main = "",
     xlab = "Class boundary based on repeated sample mean")

mean(Marks)
mean(sample100)
mean(sample500)
mean(samplecombrwmean)


########## More and more samples
set.seed(989)

sample10.mean = numeric(5)
for(i in 1:5)
{
  sample10 = sample(Marks, 10)
  sample10.mean[i] = mean(sample10)
}
sample10.mean
#hist(sample10.mean)
mean(sample10.mean)

par(mfrow = c(2, 2))
plot(sample10.mean)
abline(h = 65, col = "blue", lty = 2)   # reference line at the population mean
var(sample10.mean)

#############
sample25.mean = numeric(5)
for(i in 1:5)
{
  sample25 = sample(Marks, 25)
  sample25.mean[i] = mean(sample25)
}
sample25.mean
plot(sample25.mean)
abline(h = 65, col = "blue", lty = 2)
var(sample25.mean)

#########
sample50.mean = numeric(5)
for(i in 1:5)
{
  sample50 = sample(Marks, 50)
  sample50.mean[i] = mean(sample50)
}
sample50.mean
plot(sample50.mean)
abline(h = 65, col = "blue", lty = 2)
var(sample50.mean)

########################
sample75.mean = numeric(5)
for(i in 1:5)
{
  sample75 = sample(Marks, 75)
  sample75.mean[i] = mean(sample75)
}
sample75.mean
plot(sample75.mean)
abline(h = 65, col = "blue", lty = 2)
var(sample75.mean)
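The four loops above differ only in the sample size. A small helper function makes the pattern explicit and lets you compare the variances of the sample means directly (a sketch; rep.sample.mean is a name introduced here for illustration, not part of the workshop material):

rep.sample.mean = function(n, reps = 5)
{
  means = numeric(reps)
  for(i in 1:reps) means[i] = mean(sample(Marks, n))
  means
}
sapply(c(10, 25, 50, 75), function(n) var(rep.sample.mean(n)))
# the variance of the sample mean shrinks as the sample size grows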

2. Methodology to deal with small sample size

set.seed(897)

############# Data
Marks = rnorm(1000, 65, 10)
Marks = round(Marks, 0)
Marks

##### Sampling scheme without replacement
l = 1000
sample10.mean = numeric(l)
for(i in 1:l)
{
  sample10 = sample(Marks, 10)   # replace = FALSE is the default
  sample10.mean[i] = mean(sample10)
}
sample10.mean
hist(sample10.mean)
mean(sample10.mean)


##### Sampling scheme with replacement
sample10.mean.wr = numeric(l)
for(i in 1:l)
{
  sample10.wr = sample(Marks, 10, replace = T)
  sample10.mean.wr[i] = mean(sample10.wr)
}
sample10.mean.wr
hist(sample10.mean.wr)

mean(sample10.mean.wr)
mean(sample10.mean)


#### Bootstrap
x = c(1, 7, 2, 3, 4, 9, 11)   # sample vector
B = 1000                      # number of bootstrap samples to be generated
out = matrix(NA, nrow = B, ncol = length(x))
b_mean = numeric(B)
b_sd = numeric(B)
for(i in 1:B) {
  out[i, ] = sample(x, size = length(x), replace = TRUE)   # resample with replacement
  b_mean[i] = mean(out[i, ])
  b_sd[i] = sd(out[i, ])
}

hist(b_mean)

mean(b_mean)   # bootstrap mean of the sample mean
mean(x)

sd(b_mean)     # bootstrap standard error

b.ci.mean = quantile(b_mean, c(2.5, 97.5)/100)   # 95% percentile confidence interval
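For comparison, the same percentile interval can be obtained with the boot package, which ships with standard R distributions (a sketch under that assumption; boot.mean is a helper name introduced here):

library(boot)
boot.mean = function(d, idx) mean(d[idx])   # statistic computed on each resample
b = boot(data = x, statistic = boot.mean, R = 1000)
b                                           # bootstrap estimate and standard error
boot.ci(b, type = "perc")                   # 95% percentile interval, comparable to b.ci.mean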

3. Data structure in R software:

data = read.csv(file.choose())
View(data)
sbp = data$sbp   # systolic blood pressure
sm = summary(sbp)
sm

b1 = boxplot(sbp, horizontal = T)
b1$out   # observations flagged as outliers

dbp = data$dbp   # diastolic blood pressure
data.md = data.frame(sbp, dbp)
View(data.md)
boxplot(data.md)

mean(sbp)
var(sbp)
sd(sbp)


par(mfrow = c(2, 2))

pos = which(sbp > 100)
sbp.gt.100 = sbp[pos]
h2 = hist(sbp.gt.100, probability = T)
length(sbp.gt.100)
h2$breaks

break.md = c(100, 130, 150, 180, 200, 240)   # user-defined class boundaries
h3 = hist(sbp.gt.100, breaks = break.md, probability = F)

h4 = hist(sbp.gt.100, probability = F)

pos1 = which(sbp <= 100)
sbp.less.equal.100 = sbp[pos1]
h1 = hist(sbp.less.equal.100, probability = T)
length(sbp.less.equal.100)
h1$breaks


sbp
dbp

plot(sbp, dbp)
r = cor(sbp, dbp)
if(r > 0.5)
  print("The correlation is strongly positive")

l = length(sbp)

x = numeric(l - 1)
x
length(x)
for(i in 1:length(x))
  x[i] = log(sbp[i+1]) - log(sbp[i])   # log-differences of successive sbp values
x
l

#cor(x, dbp)   # would throw an error: x is one element shorter than dbp
cor(x, dbp[-1])
cor(sbp, dbp)


par(mfrow = c(2, 2))

plot(x, dbp[-1])
plot(sbp, dbp)
plot(sbp, dbp, col = "red")
plot(sbp, dbp, col = "red", lwd = 2)

plot(sbp, dbp, col = "red", lwd = 2,
     xlab = "Systolic Blood Pressure", ylab = "Diastolic Blood Pressure")

plot(sbp, dbp, col = "red", lwd = 2,
     xlab = "Systolic Blood Pressure",
     ylab = "Diastolic Blood Pressure", main = "Information on Blood Pressure")

plot(sbp, dbp, col = "red", lwd = 2,
     xlab = "Systolic Blood Pressure",
     ylab = "Diastolic Blood Pressure",
     main = "Information on Blood Pressure", cex.lab = 1.3)
axis(1, font = 2)   ## x axis
axis(2, font = 2)   ## y axis

plot(sbp, dbp, col = "red", lwd = 2,
     xlab = "Systolic Blood Pressure",
     ylab = "Diastolic Blood Pressure",
     main = "Information on Blood Pressure",
     cex.lab = 1.3, cex = 2)

plot(sbp, dbp, col = "red", lwd = 2,
     xlab = "Systolic Blood Pressure",
     ylab = "Diastolic Blood Pressure",
     main = "Information on Blood Pressure",
     cex.lab = 1.3, cex = 2, pch = 19)


m = sbp^2
fit.linear = lm(dbp ~ sbp)   # simple linear regression of dbp on sbp
summary(fit.linear)

lines(sbp, predict(fit.linear), lwd = 5)
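Since lines() joins the fitted values in the order the observations appear, the overlaid line can zigzag when sbp is unsorted; drawing the fitted line with abline() avoids this (a minor alternative, not part of the original script):

abline(fit.linear, lwd = 5)   # fitted regression line, independent of data ordering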

a.hat = as.numeric(coef(fit.linear)[1])   # estimated intercept
b.hat = as.numeric(coef(fit.linear)[2])   # estimated slope

print(a.hat)
print(b.hat)

resd = residuals(fit.linear)
plot(resd)
abline(h = 0, col = "blue", lwd = 3)   # residuals should scatter randomly around zero


9th March 2024:

## Access the data through the file "anova_data_final"
## You are not allowed to provide the path manually
data = read.csv(file.choose())
View(data)
stars(data)

## Extract the three groups' data
group1 = data[, 1]
group2 = data[, 2]
group3 = data[, 3]

## Draw the histogram for each of the groups and display them on the same graphics page
par(mfrow = c(2, 2))
hist(group1)
hist(group2)
hist(group3)


### Generate another dataset with the name "group4" by adding to the group1 data
### a random sample from the normal distribution with mean 48 and standard deviation 1
### Extract the group4 data
## Draw the histogram of the group4 data

group4 = group1 + rnorm(length(group1), 48, 1)
hist(group4)

## Boxplot
boxplot(group4)


#### Apply a suitable normality test, using both visual and statistical diagnostics,
### for group1, group2, group3, group4

qqnorm(group1)
qqnorm(group2)
qqnorm(group3)
qqnorm(group4)
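A Q-Q plot is easier to read with a reference line; base R's qqline() draws the line through the first and third quartiles (a small addition to the workshop script):

qqnorm(group4)
qqline(group4, col = "red")   # points close to the line indicate approximate normality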



shapiro.test(group1)
shapiro.test(group2)
shapiro.test(group3)
shapiro.test(group4)


## Do the scatterplot of group1 vs group4
plot(group1, group4)

## Find the correlation between group1 and group4, and between group1 and group3.
## Do the correlation test.
## Scale each of the group1 and group4 observations by 0.5 and
## find the correlation coefficient of the scaled data.
## Check whether the correlation coefficients of the original and scaled variables
## are the same or not.

cor(group1, group4)
cor(group1, group3)

cor.test(group1, group4)
cor.test(group1, group3)

plot(group1, group4)

group11 = group1/0.5   # scaled versions of the original variables
group41 = group4/0.5

plot(group11, group41)
plot(group1, group4)

cor.test(group11, group41)   # correlation is invariant to scaling


### Fit a linear regression by considering group4 as the response and
### group1 as the explanatory variable

fit.lm = lm(group4 ~ group1)
summary(fit.lm)

new.group1 = group1^2   # polynomial terms in group1
new.group2 = group1^3
fit.new = lm(group4 ~ group1 + new.group1 + new.group2)
summary(fit.new)

fitted.values(fit.lm)
rs1 = group4 - fitted.values(fit.lm)   # residuals computed by hand
plot(rs1)

## Interpret the p-values for the estimated regression parameters
## Also interpret the significance of R^2.

summary(fit.lm)
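The quantities to interpret can also be extracted from the summary object directly (a sketch; these component names are standard for summary.lm objects):

s = summary(fit.lm)
coef(s)        # estimates, standard errors, t-values and p-values
s$r.squared    # proportion of the variance in group4 explained by the model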


### Plot the residuals.
## Does the residual distribution follow a normal law? Do the necessary tests
rs = residuals(fit.lm)
plot(rs)
abline(h = 0, col = "red")
hist(rs)
qqnorm(rs)
shapiro.test(rs)


### Apply a suitable test for testing the equality of the population means
### of group1 and group4.
t.test(group1, group4)

## Testing of variances ##
#library(lmtest)
#bptest(group1~group4)

bartlett.test(list(group1, group4))
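For exactly two groups, the classical F test for equality of variances is also available in base R (an addition for completeness; note that bptest() from lmtest, commented out above, tests for heteroscedasticity in a regression rather than equality of two group variances):

var.test(group1, group4)   # F test comparing the two group variances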


## Non-parametric alternative (requested by Mr. Vishnu, one of the enthusiastic participants):
## the Wilcoxon rank-sum test for the non-parametric case.

s1 = rexp(n = 1000, rate = 1/65)   # exponential sample with mean 65
s1
mean(s1)

s2 = rexp(n = 1000, rate = 1/75)   # exponential sample with mean 75
s2
mean(s2)

shapiro.test(s1)   # normality is clearly rejected, so a non-parametric test is appropriate
shapiro.test(s2)

wilcox.test(s1, s2, paired = F)   # rank-sum test for two independent samples