# R Source Code
####################################################
# Chapter-by-chapter R code for Reasoning with Data
# by Jeffrey Stanton
# Version dated November 26, 2016
# Introduction
install.packages("modeest") # Download the mode estimation package from CRAN
library("modeest") # Attach the package so mfv() is available below
#####################################
# Chapter 1
rainfall <- c(0,0,0,2,1,1,4) # Amount of rain each day this week
sum(rainfall) / length(rainfall) # Compute the mean the hard way
mean(rainfall) # Use a function to compute the mean
# Only if you did not do this in the Introduction
#install.packages("modeest") # Download the mode estimation package
#library(modeest) # Make the package ready to use
mfv(rainfall) # mfv stands for most frequent value
votes <- c(200,300,400) # Here is scenario one
(votes - mean(votes)) ^ 2 # Show a list of squared deviations
sum( (votes - mean(votes)) ^ 2) # Add them together
sum( (votes - mean(votes)) ^ 2) / length(votes) # Divide by the number of observations
votes1 <- c(200,300,400) # Here is scenario one again
sqrt( sum((votes1 - mean(votes1))^2) / length(votes1) ) # That is the standard deviation
votes2 <- c(299,300,301) # Here is scenario two
sqrt( sum((votes2 - mean(votes2))^2) / length(votes2) ) # And the same for candidate 2
# The commented code makes a high res plot when run in R
#png("Figure01_1.png", width = 6, height = 6, units = 'in', res = 300)
hist( rnorm(n=1000, mean=100, sd=10), main=NULL ) # Histogram of 1000 normal draws
#dev.off()
# This shows a finer-grained histogram with 100 categories or “breaks”
# Try increasing the n and the breaks to see what happens
hist( rnorm(n=10000, mean=100, sd=10), breaks=100 )
#png("Figure01_2.png", width = 6, height = 6, units = 'in', res = 300)
hist(rpois(n=1000, lambda=1), main=NULL) # Poisson counts are right-skewed
#dev.off()
mean(rpois(n=1000, lambda=1)) # Mean of a fresh Poisson sample; near lambda
myPoiSample <- rpois(n=1000, lambda=1) # Save one sample so the mean and plot agree
mean(myPoiSample)
hist(myPoiSample)
#####################################
# Chapter 2
table(rbinom(n=100,size=6,prob=0.5)) # Frequencies of heads-counts in 100 six-coin trials
hist(rbinom(n=100,size=6,prob=0.5))
#png("Figure02_1.png", width = 6, height = 6, units = 'in', res = 300)
hist(rbinom(n=1000,size=6,prob=0.5), main=NULL)
#dev.off()
#png("Figure02_2.png", width = 6, height = 6, units = 'in', res = 300)
barplot(table(rbinom(n=1000,size=6,prob=0.5)), main=NULL) # Bar plot avoids histogram binning artifacts
#dev.off()
table(rbinom(n=1000,size=6,prob=0.5))/1000 # Convert counts to proportions
#png("Figure02_3.png", width = 6, height = 6, units = 'in', res = 300)
barplot(table(rbinom(n=1000,size=6,prob=0.5))/1000, main=NULL)
#dev.off()
probTable <- table(rbinom(n=1000,size=6,prob=0.5))/1000 # Save a probability table
probTable
cumsum(probTable) # Cumulative probabilities across outcomes
probTable <- table(rbinom(n=1000,size=6,prob=0.5))/1000 # Fresh sample for the figure
#png("Figure02_4.png", width = 6, height = 6, units = 'in', res = 300)
barplot(cumsum(probTable),main=NULL)
#dev.off()
probTable <- table(rbinom(n=10000,size=100,prob=0.5))/10000 # More trials per event look more normal
barplot(probTable)
barplot(cumsum(probTable))
# Sidebar 2.1: Create Your Own Tables with R
#
toast <- matrix(c(2,1,3,4),ncol=2,byrow=TRUE) # Create a two column structure using the matrix() command
colnames(toast) <- c("Down","Up") # Label the columns
rownames(toast) <- c("Jelly","Butter") # Label the rows
toast <- as.table(toast) # Convert from matrix to table
toast # Show the table on the console
margin.table(toast) # This is the grand total of toast drops
margin.table(toast,1) # These are the marginal totals for rows
margin.table(toast,2) # These are the marginal totals for columns
toastProbs <- toast/margin.table(toast) # Calculate probabilities
toastProbs # Report probabilities to console
#####################################
# Chapter 3
toastAngleData <- runif(1000,0,180) # Random numbers from uniform distribution
head(toastAngleData) # Look at the first few numbers in the list
tail(toastAngleData) # Look at the last few numbers in the list
hist(toastAngleData) # Plot a histogram of all 1000 data points
sample(toastAngleData,size=14,replace=TRUE) # Draw one random sample of n=14
mean(sample(toastAngleData,size=14,replace=TRUE)) # Mean of one such sample
replicate(4, mean(sample(toastAngleData,size=14,replace=TRUE))) # Four sample means
samplingDistribution <- replicate(10000,mean(sample(toastAngleData,size=14,replace=TRUE))) # 10,000 sample means
#png("Figure03_1.png", width = 6, height = 6, units = 'in', res = 300)
hist(samplingDistribution, main=NULL) # The sampling distribution of the mean
#dev.off()
mean(samplingDistribution) # Compare with the full dataset's mean below
mean(toastAngleData)
summary(samplingDistribution)
quantile(samplingDistribution, c(.01, .05, .50, .95, .99)) # Key quantiles of the distribution
#png("Figure03_2.png", width = 6, height = 6, units = 'in', res = 300)
hist(samplingDistribution, main=NULL)
abline(v=quantile(samplingDistribution,.01)) # Mark the 1st percentile
abline(v=quantile(samplingDistribution,.05)) # Mark the 5th percentile
abline(v=quantile(samplingDistribution,.95)) # Mark the 95th percentile
abline(v=quantile(samplingDistribution,.99)) # Mark the 99th percentile
#dev.off()
samplingDistribution[ samplingDistribution<= quantile(samplingDistribution,.01) ] # Means in the lowest 1%
summary( samplingDistribution[samplingDistribution <= quantile(samplingDistribution,.01)] )
#####################################
# Chapter 4
# Descriptive statistics and box plot for mtcars
mean( mtcars$mpg[ mtcars$am == 0 ] ) # Automatic transmissions
mean( mtcars$mpg[ mtcars$am == 1 ] ) # Manual transmissions
sd( mtcars$mpg[ mtcars$am == 0 ] ) # Automatic transmissions
sd( mtcars$mpg[ mtcars$am == 1 ] ) # Manual transmissions
#png("Figure04_1.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(mpg ~ am, data=mtcars, main=NULL) # Boxplot of mpg, grouped by am
#dev.off()
mean( sample(mtcars$mpg[ mtcars$am == 0 ],size=19,replace=TRUE) ) # Resampled mean, automatics
mean( sample(mtcars$mpg[ mtcars$am == 1 ],size=13,replace=TRUE) ) # Resampled mean, manuals
mean( sample(mtcars$mpg[ mtcars$am == 0 ],size=19,replace=TRUE) ) - mean( sample(mtcars$mpg[ mtcars$am == 1 ],size=13,replace=TRUE) ) # One resampled mean difference
meanDiffs <- replicate(100, mean( sample(mtcars$mpg[ mtcars$am == 0 ],size=19,replace=TRUE) ) - mean( sample(mtcars$mpg[ mtcars$am == 1 ],size=13,replace=TRUE) )) # 100 resampled mean differences
#png("Figure04_2.png", width = 6, height = 6, units = 'in', res = 300)
hist(meanDiffs, main=NULL)
#dev.off()
quantile(meanDiffs, c(0.025, 0.975)) # Empirical 95% range of the mean difference
t.test(mtcars$mpg[mtcars$am==0] ,mtcars$mpg[mtcars$am==1]) # Conventional t-test for comparison
install.packages("animation") # Package with a confidence-interval animation
library(animation)
conf.int(level=0.95) # Animate repeated 95% confidence intervals (animation package)
#####################################
# Chapter 5
# Don't forget to install JAGS first
# Visit: http://mcmc-jags.sourceforge.net
install.packages("BEST") # Bayesian Estimation Supersedes the t-test
library(BEST)
carsBest <- BESTmcmc(mtcars$mpg[mtcars$am==0] ,mtcars$mpg[mtcars$am==1]) # Bayesian comparison of the two groups
#png("Figure05_1.png", width = 6, height = 6, units = 'in', res = 300)
plot(carsBest, main=NULL) # Posterior distribution of the mean difference
#dev.off()
t.test(mtcars$mpg[mtcars$am==0] ,mtcars$mpg[mtcars$am==1]) # Frequentist comparison
install.packages("effsize") # Package for effect size calculations
library(effsize)
cohen.d(mtcars$mpg[mtcars$am==0] ,mtcars$mpg[mtcars$am==1]) # Cohen's d effect size
set.seed(54321) # Control randomization
carsTdist <- rt(n=10000,df=18.332) # 10,000 random t values
hist(carsTdist) # Show in a histogram
lowTvalues <- carsTdist[carsTdist <= -3.7671] # Here is the lower tail
hiTvalues <- carsTdist[carsTdist >= 3.7671] # Here is the upper tail
length(lowTvalues) + length(hiTvalues) # The number of observations in the tails
# This is the t-command used at the end of the exercises
t.test(rnorm(100000,mean=17.1,sd=3.8),rnorm(100000,mean=17.2,sd=3.8))
#####################################
# Chapter 6
set.seed(1) # Control the randomization
pgrp1 <- sample(precip,20, replace=TRUE) # Three samples of 20 from the precip data
pgrp2 <- sample(precip,20, replace=TRUE)
pgrp3 <- sample(precip,20, replace=TRUE)
var(c(pgrp1,pgrp2,pgrp3)) # Variance of all 60 points together
var(precip) # Compare with the variance of the full dataset
mean(pgrp1) # Examine the means of the three groups
mean(pgrp2)
mean(pgrp3)
barplot(c(mean(pgrp1),mean(pgrp2),mean(pgrp3))) # Create a bar plot of the means
var(c(mean(pgrp1),mean(pgrp2),mean(pgrp3))) # Variance among the means
pgrp3 <- pgrp3 - 5 # Take away five inches of rain from each point in sample 3
#png("Figure06_1.png", width = 6, height = 6, units = 'in', res = 300)
barplot(c(mean(pgrp1),mean(pgrp2),mean(pgrp3)), main=NULL) # Bar plot of the new means
#dev.off()
var(c(mean(pgrp1),mean(pgrp2),mean(pgrp3))) # Between-groups variance after the shift
var(c(pgrp1,pgrp2,pgrp3)) # Total variance after the shift
randomFs <- rf(n=100,df1=2,df2=57) # Random draws from the F distribution
#png("Figure06_2.png", width = 6, height = 6, units = 'in', res = 300)
hist(randomFs, main=NULL)
#dev.off()
# Run ANOVA on groups sampled from the same population
set.seed(10) # Control the randomization
precipAmount <- sample(precip,60,replace=TRUE) # Enough for 3 groups of 20
precipGrp <- as.factor(rep(seq(from=1,to=3,by=1),20)) # Group designators, 3 groups
precipDF <- data.frame(precipAmount, precipGrp) # Put everything in data frame
#png("Figure06_3.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(precipAmount ~ precipGrp, data=precipDF, main=NULL) # Get a box plot of the distribs
#dev.off()
precipOut <- aov(precipAmount ~ precipGrp, data=precipDF) # Run the ANOVA
summary(precipOut) # Provide an ANOVA table
# Uses the density function to show the shape of different F-distributions
fVals <- seq(from=0.01,to=5,by=0.01) # A grid of F values
plot(fVals,df(fVals,df1=2,df2=57)) # df() is the F density function
points(fVals,df(fVals,df1=3,df2=57))
points(fVals,df(fVals,df1=4,df2=57))
hist(precipOut$residuals) # Summarize the residuals
# This is the code for Sidebar 6.1
install.packages('gtools') #install gtools to get permutations
library(gtools) # Make the package ready
tinyPop <- c(1,2,3) # Here is a tiny population
allSamp <- permutations(n=3,r=3,v=tinyPop,repeats.allowed=T) # All samples with replacement
allSamp # Verify: 27 unique samples
apply(allSamp,1,var) # List the sample variance of each sample
mean(apply(allSamp,1,var)) # What is the mean of those variances
# Code for sidebar 6.2
install.packages("BEST") # Skip if installed in Chapter 5
library(BEST)
data(mtcars)
priorList <- list(muM = c(20,20), muSD = c(4,4)) # Priors for the two group means
carsBest2 <- BESTmcmc(mtcars$mpg[mtcars$am==0] ,mtcars$mpg[mtcars$am==1],priors=priorList)
plot(carsBest2, main=NULL)
# Code for Bayesian ANOVA
install.packages("BayesFactor")
library("BayesFactor")
precipBayesOut <- anovaBF(precipAmount ~ precipGrp, data=precipDF) # Calc Bayes Factors
mcmcOut <- posterior(precipBayesOut,iterations=10000) # Run mcmc iterations
#png("Figure06_4.png", width = 6, height = 6, units = 'in', res = 300)
plot(mcmcOut[,"mu"], main=NULL) # Show the range of values for the grand mean
#dev.off()
# Display a 95% HDI for precipitation group 1
#png("Figure06_5.png", width = 6, height = 6, units = 'in', res = 300)
par(mfcol=c(1,1)) # Reset to a single plot panel
hist(mcmcOut[,"precipGrp-1"], main=NULL)
abline(v=quantile(mcmcOut[,"precipGrp-1"],c(0.025)), col="black") # Lower HDI bound
abline(v=quantile(mcmcOut[,"precipGrp-1"],c(0.975)), col="black") # Upper HDI bound
#dev.off()
print("95% HDI lower bound: ")
print(quantile(mcmcOut[,"precipGrp-1"],c(0.025)))
print("95% HDI upper bound: ")
print(quantile(mcmcOut[,"precipGrp-1"],c(0.975)))
# Plot the posteriors for the three groups in a boxplot
#png("Figure06_6.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(as.matrix(mcmcOut[,2:4]), main=NULL) # Boxplot the posteriors for the groups
#dev.off()
plot(mcmcOut[,1:6]) # Trace/density plots for the first six parameters
# Uncomment to show the other groups
# plot(mcmcOut[,"precipGrp-2"]) # Show the range of values for the second group
# plot(mcmcOut[,"precipGrp-3"]) # Show the range of values for the third group
precipBayesOut # Report the Bayes factor result
# Analyze chickwt data that contains multiple groups with some mean differences
data(chickwts)
chicksOut <- aov(weight ~ feed, data=chickwts) # Run the ANOVA
summary(chicksOut)
chicksBayesOut <- anovaBF(weight ~ feed, data=chickwts) # Calc Bayes Factors
mcmcOut2 <- posterior(chicksBayesOut,iterations=10000) # Run mcmc iterations
#png("Figure06_7.png", width = 6, height = 6, units = 'in', res = 300)
par(mar=c(7,4,4,2),las=2) # Make a little more room for the labels
boxplot(as.matrix(mcmcOut2[,2:7]), main=NULL) # Boxplot the posteriors for the groups
par(mar=c(5,4,4,2),las=0) # Reset the margins
#dev.off()
summary(mcmcOut2)
# Sample post hoc comparison of sunflower and meatmeal
plot(BESTmcmc(chickwts[chickwts$feed=="sunflower",1],chickwts[chickwts$feed=="meatmeal",1]))
chicksBayesOut
#####################################
# Chapter 7
set.seed(12345) # Control the randomization
wood <- rnorm(24) # Two independent standard-normal variables
heat <- rnorm(24)
mean(wood)
mean(heat)
sd(wood)
sd(heat)
#png("Figure07_1.png", width = 6, height = 6, units = 'in', res = 300)
plot(wood,heat, main=NULL) # Scatterplot of the two uncorrelated variables
#dev.off()
#png("Figure07_2.png", width = 6, height = 6, units = 'in', res = 300)
plot(wood,(wood-mean(wood)), main=NULL) # A variable plotted against its own deviations
#dev.off()
cpWH <- wood * heat # Cross-products of the two variables
#png("Figure07_3.png", width = 6, height = 6, units = 'in', res = 300)
hist(cpWH, main=NULL)
#dev.off()
mean(cpWH) # Mean cross-product for the uncorrelated pair
# Make a new, fake version of heat that will correlate with wood
newHeat <- wood/1.41 + heat/1.41 # Make a mixture of the two old variables
mean(newHeat) # What's the mean of our new heat variable?
sd(newHeat) # What's the sd of our new heat variable?
cpWnewH <- wood * newHeat # Cross-products with the correlated variable
#png("Figure07_4.png", width = 6, height = 6, units = 'in', res = 300)
hist(cpWnewH, main=NULL)
#dev.off()
mean(cpWnewH) # Mean cross-product for the correlated pair
#png("Figure07_5.png", width = 6, height = 6, units = 'in', res = 300)
plot(wood,newHeat, main=NULL)
#dev.off()
cor(wood,newHeat) # The Pearson correlation
set.seed(12345) # Start with a random number seed
wood <- rnorm(2400) # Make two vectors of N=2400
heat <- rnorm(2400)
fireDF <- data.frame(wood, heat) # Put them in a dataframe
nrow(fireDF) # Verifying 2400 rows of two variables
fireDF[sample(nrow(fireDF), 24), ] # Generates one sample of n=24
cor(fireDF[sample(nrow(fireDF), 24), ]) # Correlation matrix for one sample
cor(fireDF[sample(nrow(fireDF), 24), ])[1,2] # Just the wood-heat correlation
corDist <- replicate(5000,cor(fireDF[sample(nrow(fireDF), 24), ])[1,2]) # Sampling distribution of r
#png("Figure07_6.png", width = 6, height = 6, units = 'in', res = 300)
hist(corDist, main=NULL)
#dev.off()
mean(corDist)
newHeat <- wood/1.41 + heat/1.41 # Correlated version again, now with N=2400
newfireDF <- data.frame(wood, newHeat) # Put them in a dataframe
newcorDist <- replicate(5000,cor(newfireDF[sample(nrow(newfireDF), 24), ])[1,2],simplify=TRUE) # Sampling distribution of r for the correlated pair
#png("Figure07_7.png", width = 6, height = 6, units = 'in', res = 300)
hist(newcorDist, main=NULL)
#dev.off()
mean(newcorDist)
# Conduct a null hypothesis test on one correlation
set.seed(12345)
wood <- rnorm(24)
heat <- rnorm(24)
cor.test(wood,heat) # Test with independent variables
cor.test(wood,(wood/1.41 + heat/1.41)) # Test with the built-in correlation
cor.test(iris[,"Sepal.Width"],iris[,"Petal.Width"]) # Real data example
install.packages("BayesFactor") # Skip if installed earlier
library("BayesFactor")
bfCorTest <- function (x,y) # Bayesian correlation test via BayesFactor
{
  # Standardize both variables so the regression slope on the
  # standardized predictor corresponds to the correlation r
  stdData <- data.frame(x = scale(x), rhoNot0 = scale(y))
  # Bayes factor for the model with a linear coefficient on rhoNot0
  bfResult <- generalTestBF(x ~ rhoNot0, data = stdData)
  # Draw posterior samples and report the distribution of the slope (r)
  posteriorDraws <- posterior(bfResult, iterations = 10000)
  print(summary(posteriorDraws[,"rhoNot0"]))
  return(bfResult) # Return Bayes factor object
}
set.seed(12345) # Reproduce the same wood and heat vectors as before
wood <- rnorm(24)
heat <- rnorm(24)
bfCorTest(wood,heat) # Bayesian test: independent variables
newHeat <- wood/1.41 + heat/1.41 # Mixture that correlates with wood
bfCorTest(newHeat, wood) # Bayesian test: correlated variables
bfCorTest(iris[,"Sepal.Width"],iris[,"Petal.Width"]) # Real data example
# Chi-square section
# Build a 2x2 contingency table from the upper-left cell count.
# The remaining cells are derived from the marginal totals, which now
# default to the book's values (column totals 50/50, first row total 30)
# but can be overridden to build other tables with fixed marginals.
#   ul        - count for the upper-left cell (supplied by the user)
#   col1Total - total of the first column (default 50)
#   row1Total - total of the first row (default 30)
#   col2Total - total of the second column (default 50)
# Returns a 2x2 numeric matrix filled row by row.
make2x2table <- function(ul, col1Total = 50, row1Total = 30, col2Total = 50)
{
  ll <- col1Total - ul # Calculate the lower left cell
  ur <- row1Total - ul # Calculate the upper right cell
  lr <- col2Total - ur # Calculate the lower right cell
  # Put all of the cells into a 2x2 matrix
  matrix(c(ul,ur,ll,lr), nrow=2, ncol=2, byrow=TRUE)
}
# Demonstrate the table builder at three different upper-left counts
make2x2table(15) # Should be like Table 7.2
make2x2table(0) # Should be like Table 7.3
make2x2table(30) # Should be like Table 7.4
# Pearson chi-squared statistic: sum over cells of
# (observed - expected)^2 / expected. Accepts any numeric
# vectors/matrices of matching shape; returns a single number.
calcChiSquared <- function(actual, expected)
{
  sum(((actual - expected) ^ 2) / expected)
}
# This makes a matrix that is just like Table 7.2
# This table represents the null hypothesis of independence
expectedValues <- matrix(c(15,15,35,35), nrow=2, ncol=2, byrow=TRUE)
calcChiSquared(make2x2table(15),expectedValues) # Zero: observed matches expected
calcChiSquared(make2x2table(0),expectedValues) # Largest possible mismatch in one direction
calcChiSquared(make2x2table(30),expectedValues) # Largest mismatch in the other direction
set.seed(12)
mean(rbinom(1000,30,prob=0.5)) # Should be near 15
#png("Figure07_8.png", width = 6, height = 6, units = 'in', res = 300)
hist(rbinom(1000,30,prob=0.5), main=NULL)
#dev.off()
chiDist <- replicate(100000,calcChiSquared(make2x2table(rbinom(n=1,size=30,prob=0.5)),expectedValues)) # Null distribution of chi-squared
#png("Figure07_9.png", width = 6, height = 6, units = 'in', res = 300)
hist(chiDist, main=NULL)
#dev.off()
quantile(chiDist,probs=c(0.95)) # Empirical critical value at alpha = .05
calcChiSquared(make2x2table(20),expectedValues)
calcChiSquared(make2x2table(10),expectedValues)
# Run the chi-square test on Table 7.1 data
chisq.test(make2x2table(20), correct=FALSE)
# Run the chi-square test on the Titanic data
# data(Titanic) # Should not need this
badBoatMF <- ftable(Titanic, row.vars=2, col.vars="Survived") # Sex by Survived table
badBoatMF
chisq.test(badBoatMF, correct=FALSE)
# Bayesian approach using BayesFactor package
ctBFout <- contingencyTableBF(make2x2table(20),sampleType="poisson",posterior=FALSE)
ctBFout
ctMCMCout <- contingencyTableBF(make2x2table(20),sampleType="poisson",posterior=TRUE,iterations=10000)
summary(ctMCMCout)
downProp <- ctMCMCout[,"lambda[1,1]"]/ctMCMCout[,"lambda[2,1]"] # Posterior ratio, first column
#png("Figure07_10.png", width = 6, height = 6, units = 'in', res = 300)
hist(downProp, main=NULL)
#dev.off()
upProp <- ctMCMCout[,"lambda[1,2]"]/ctMCMCout[,"lambda[2,2]"] # Posterior ratio, second column
#png("Figure07_11.png", width = 6, height = 6, units = 'in', res = 300)
hist(upProp, main=NULL)
#dev.off()
diffProp <- downProp-upProp # Posterior distribution of the difference in ratios
#png("Figure07_12.png", width = 6, height = 6, units = 'in', res = 300)
hist(diffProp, main=NULL)
abline(v=quantile(diffProp,c(0.025)), col="black") # Lower HDI bound
abline(v=quantile(diffProp,c(0.975)), col="black") # Upper HDI bound
#dev.off()
mean(diffProp)
badBoatMF <- ftable(Titanic, row.vars=2, col.vars="Survived")
ctBFout <- contingencyTableBF(badBoatMF,sampleType="poisson",posterior=FALSE)
ctBFout
ctMCMCout <- contingencyTableBF(badBoatMF,sampleType="poisson",posterior=TRUE,iterations=10000)
summary(ctMCMCout)
maleProp <- ctMCMCout[,"lambda[1,1]"]/ctMCMCout[,"lambda[1,2]"] # Row 1 ratio of the two Survived columns
femaleProp <- ctMCMCout[,"lambda[2,1]"]/ctMCMCout[,"lambda[2,2]"] # Row 2 ratio of the two Survived columns
diffProp <- maleProp - femaleProp # Difference between the two row ratios
hist(diffProp)
mean(diffProp)
abline(v=quantile(diffProp,c(0.025)), col="black")
abline(v=quantile(diffProp,c(0.975)), col="black")
#####################################
# Chapter 8
set.seed(321) # Control the randomization
hardwork <- rnorm(120) # Four independent standard-normal predictors
basicsmarts <- rnorm(120)
curiosity <- rnorm(120)
randomnoise <- rnorm(120)
gpa <- hardwork/2 + basicsmarts/2 + curiosity/2 + randomnoise/2 # Outcome is an equal mixture of all four
sd(gpa)
#png("Figure08_1.png", width = 6, height = 6, units = 'in', res = 300)
plot(hardwork, gpa, main=NULL)
#dev.off()
#png("Figure08_2.png", width = 6, height = 6, units = 'in', res = 300)
plot(hardwork,gpa, main=NULL)
abline(a=0, b=0.56) # Candidate regression line with intercept 0 and slope 0.56
#dev.off()
arrows(min(hardwork),gpa[which.min(hardwork)],min(hardwork),min(hardwork)*0.56) # Draw one residual as an arrow
#png("Figure08_3.png", width = 6, height = 6, units = 'in', res = 300)
hist(gpa - (hardwork * 0.56), main=NULL) # Distribution of the raw residuals
#dev.off()
sum(gpa - (hardwork * 0.56)) # Sum of the raw residuals
# Squared error of each observation for a candidate slope
# (intercept assumed 0): (dv - iv*slope)^2, returned elementwise.
calcSQERR <- function(dv, iv, slope)
{
  residVals <- dv - iv * slope # Raw residual for every observation
  residVals * residVals        # Square each residual
}
head(calcSQERR(gpa,hardwork,0.56)) # First few squared errors for slope 0.56
sum(calcSQERR(gpa,hardwork,0.56)) # Total squared error for slope 0.56
#png("Figure08_4.png", width = 6, height = 6, units = 'in', res = 300)
hist(calcSQERR(gpa,hardwork,0.56), main=NULL) # Squared errors are right-skewed
#dev.off()
# Total squared error for a candidate slope.
# NOTE: reads gpa and hardwork from the global environment,
# so those vectors must exist before this is called.
sumSQERR <- function(slope)
{
  errPerObs <- calcSQERR(gpa, hardwork, slope)
  sum(errPerObs)
}
sumSQERR(0.56) # Total squared error for the candidate slope
trySlopes <- seq(from=0, to=1, length.out=40) # Forty candidate slopes between 0 and 1
sqerrList <- sapply(trySlopes, sumSQERR) # Total squared error for each candidate
#png("Figure08_5.png", width = 6, height = 6, units = 'in', res = 300)
plot(trySlopes, sqerrList, main=NULL) # The minimum marks the least-squares slope
#dev.off()
educdata <- data.frame(gpa, hardwork, basicsmarts, curiosity) # Assemble the dataset
regOut <- lm(gpa ~ hardwork, data=educdata) # Simple regression with one predictor
summary(regOut)
regOut3 <- lm(gpa ~ hardwork + basicsmarts + curiosity, data=educdata) # Multiple regression
summary(regOut3)
#png("Figure08_6.png", width = 6, height = 6, units = 'in', res = 300)
plot(randomnoise,regOut3$residuals, main=NULL) # Residuals vs. the noise component
#dev.off()
summary(residuals(regOut3))
cor(educdata) # Correlation matrix of all variables
regOutMCMC <- lmBF(gpa ~ hardwork + basicsmarts + curiosity, data=educdata, posterior=TRUE, iterations=10000) # Bayesian regression (BayesFactor)
summary(regOutMCMC)
#png("Figure08_7.png", width = 6, height = 6, units = 'in', res = 300)
hist(regOutMCMC[,"hardwork"], main=NULL) # Posterior distribution of the hardwork coefficient
abline(v=quantile(regOutMCMC[,"hardwork"],c(0.025)), col="black")
abline(v=quantile(regOutMCMC[,"hardwork"],c(0.975)), col="black")
#dev.off()
rsqList <- 1 - (regOutMCMC[,"sig2"] / var(gpa)) # Convert error-variance draws to R-squared
# length(rsqList) # Confirms 10000 R-squared estimates
mean(rsqList) # Overall mean R-squared is 0.75
#png("Figure08_8.png", width = 6, height = 6, units = 'in', res = 300)
hist(rsqList, main=NULL)
abline(v=quantile(rsqList,c(0.025)), col="black")
abline(v=quantile(rsqList,c(0.975)), col="black")
#dev.off()
regOutBF <- lmBF(gpa ~ hardwork + basicsmarts + curiosity, data=educdata) # Bayes factor for the full model
regOutBF
# Real example using state.x77 data
stateData <- data.frame(state.x77) # Convert the built-in matrix to a data frame
stateOut <- lm(Life.Exp ~ HS.Grad + Income + Illiteracy,data=stateData)
summary(stateOut)
stateOutMCMC <- lmBF(Life.Exp ~ HS.Grad + Income + Illiteracy,data=stateData, posterior=TRUE, iterations=100000)
summary(stateOutMCMC)
rsqList <- 1 - (stateOutMCMC[,"sig2"] / var(stateData$Life.Exp)) # Posterior R-squared draws
mean(rsqList) # Overall mean R-squared
quantile(rsqList,c(0.025))
quantile(rsqList,c(0.975))
#hist(stateData$Illiteracy) # Histogram of raw data
#hist(stateOutMCMC[,"Illiteracy"]) # Posterior distribution of B weight
#boxplot(as.numeric(stateOutMCMC[,"Illiteracy"])) # Posterior distribution of B weight
stateOutBF <- lmBF(Life.Exp ~ HS.Grad + Income + Illiteracy,data=stateData)
stateOutBF
# Code for bonus homework item
set.seed(1)
betaVar <- scale(rbeta(50,shape1=1,shape2=10)) # Skewed predictor, standardized
normVar <- rnorm(50)
poisVar <- scale(rpois(50,lambda=10)) # Count-based predictor, standardized
noiseVar <- scale(runif(50)) # Uniform noise, standardized
depVar <- betaVar/2 + normVar/2 + poisVar/2 + noiseVar/2
oddData <- data.frame(depVar,betaVar,normVar,poisVar)
summary(lm(depVar ~ .,data=oddData)) # Regress depVar on all other columns
#####################################
# Chapter 9
install.packages("HSAUR") # Download the package that contains the weightgain data
library("HSAUR") # Attach the package
data("weightgain", package = "HSAUR") # Load the 2x2 factorial weight-gain dataset
# Create an interaction plot
wg <- weightgain # Shorter name for convenience
#png("Figure09_1.png", width = 7, height = 6, units = 'in', res = 300)
interaction.plot(x.factor=wg$source,trace.factor=wg$type,response=wg$weightgain)
#dev.off()
# Two equivalent specifications of the factorial ANOVA
# (use <- for assignment, consistent with the rest of this file)
aovOut <- aov(weightgain ~ source + type + source:type, data=weightgain)
aovOut2 <- aov(weightgain ~ source * type, data=weightgain)
# Code for Sidebar 9.1
testF <- rf(n=10000, df1=1, df2=36) # Generate random Fs for F(1,36)
hist(testF) # Display a histogram
abline(v=quantile(testF,c(0.95)),col="black") # Show threshold of significance, p<.05
quantile(testF,c(0.95)) # Report the threshold to the console
# Back to chapter 9 code. . .
# Use <- for assignment, consistent with the rest of this file
aovOut3 <- anovaBF(weightgain ~ source*type, data=weightgain) # Bayes factors for all candidate models
aovOut3
aovOut3[4]/aovOut3[3] # What's the odds ratio of model 4 vs. model 3?
mcmcOut <- posterior(aovOut3[4],iterations=10000) # Run mcmc iterations
# summary(mcmcOut) # Review detailed posterior distributions
#png("Figure09_2.png", width = 7, height = 6, units = 'in', res = 300)
boxplot(as.matrix(mcmcOut[,2:5]), main=NULL) # Figure 9.2
#dev.off()
#png("Figure09_3.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(as.matrix(mcmcOut[,6:7]), main=NULL) # Figure 9.3
#dev.off()
install.packages("gplots") # Package with the plotmeans() function
library("gplots")
#png("Figure09_4.png", width = 6, height = 6, units = 'in', res = 300)
plotmeans(weightgain ~ interaction(source,type,sep =" "), data=weightgain,connect=list(c(1,2),c(3,4))) # Cell means with error bars
#dev.off()
# Show regression lines on a scatterplot of radiation vs. ozone
install.packages("lattice") # Package with the environmental data
library(lattice)
data(environmental)
#png("Figure09_5.png", width = 6, height = 6, units = 'in', res = 300)
plot(environmental$radiation,environmental$ozone)
# This grey line shows what happens when we only consider
# the data with wind speeds above the median wind speed
hiWind <- subset(environmental, wind > median(environmental$wind))
hiLmOut <- lm(ozone ~ radiation,data=hiWind)
abline(hiLmOut,col="grey")
# This dotted black line shows what happens when we only consider
# the data with wind speeds at or below the median wind speed
loWind <- subset(environmental, wind <= median(environmental$wind))
loLmOut <- lm(ozone ~ radiation,data=loWind)
abline(loLmOut,col="black",lty=3)
#dev.off()
lmOut1 <- lm(ozone ~ radiation * wind, data=environmental) # Moderated regression with interaction
summary(lmOut1)
#png("Figure09_6.png", width = 6, height = 6, units = 'in', res = 300)
plot(environmental$radiation,residuals(lmOut1)) # Residuals vs. predictor
abline(h=0)
#dev.off()
#png("Figure09_7.png", width = 6, height = 6, units = 'in', res = 300)
plot(environmental$ozone,residuals(lmOut1)) # Residuals vs. outcome
abline(h=0)
#dev.off()
# Sidebar 9.3 code
plot(environmental$radiation,environmental$ozone)
env <- environmental # Working copy of the data
env$radSqr <- env$radiation^2 # Quadratic term for radiation
lmOutQuad <- lm(ozone ~ radiation + wind + radSqr + radiation:wind, data=env) # Model with curvature and interaction
summary(lmOutQuad)
pairs(environmental,panel=panel.smooth) # Scatterplot matrix with smoothers
# Rerun the analysis with centered variables
stdenv <- data.frame(scale(environmental,center=TRUE,scale=FALSE)) # Mean-center every variable
lmOut2 <- lm(ozone ~ radiation * wind,data=stdenv)
summary(lmOut2)
lmOutSimple <- lm(ozone ~ radiation + wind,data=stdenv) # Main effects only
lmOutInteract <- lm(ozone ~ radiation + wind + radiation:wind,data=stdenv) # Adds the interaction term
install.packages("lmSupport") # Package with modelCompare() and modelPredictions()
library(lmSupport)
modelCompare(lmOutSimple, lmOutInteract) # Does the interaction improve fit?
lmOutBayes1 <- lmBF(ozone ~ radiation + wind,data=stdenv)
lmOutBayes2 <- lmBF(ozone ~ radiation + wind + radiation:wind,data=stdenv)
lmOutBayes2/lmOutBayes1 # Bayes factor comparison for the interaction term
mcmcOut <- lmBF(ozone ~ radiation + wind + radiation:wind,data=stdenv, posterior=TRUE,iterations=10000)
summary(mcmcOut)
rsqList <- 1 - (mcmcOut[,"sig2"] / var(stdenv$ozone)) # Posterior distribution of R-squared
mean(rsqList) # Overall mean R-squared
quantile(rsqList,c(0.025))
quantile(rsqList,c(0.975))
loWind$wind <- mean(loWind$wind) - sd(loWind$wind) # Fix wind at one SD below its subgroup mean
hiWind$wind <- mean(hiWind$wind) + sd(hiWind$wind) # Fix wind at one SD above its subgroup mean
loWindOzone <- modelPredictions(lmOut1, Data=loWind, Type = 'response') # Predicted ozone at low wind
hiWindOzone <- modelPredictions(lmOut1, Data=hiWind, Type = 'response') # Predicted ozone at high wind
#png("Figure09_8.png", width = 6, height = 6, units = 'in', res = 300)
plot(loWind$radiation,loWindOzone$Predicted,xlim=c(0,350),ylim=c(10,90))
points(hiWind$radiation,hiWindOzone$Predicted,pch=3) # Overlay the high-wind predictions
#dev.off()
#####################################
# Chapter 10
# Create a sequence of 100 numbers, ranging from -6 to 6 to serve as the X variable
logistX <- seq(from=-6, to=6, length.out=100)
# Compute the logit function using exp(), the inverse of log()
logistY <- exp(logistX)/(exp(logistX)+1)
#png("Figure10_1.png", width = 6, height = 6, units = 'in', res = 300)
# Now review the beautiful S curve
plot(logistX,logistY)
#dev.off()
# Create a random, standard-normal predictor variable
set.seed(123)
logistX <- rnorm(n=100,mean=0,sd=1)
# Create an outcome variable as a logit function of the predictor
logistY <- exp(logistX)/(exp(logistX)+1)
# Make the dichotomous/binomial version of the outcome variable
binomY <- round(logistY)
# Add noise to the predictor so that it does not perfectly predict the outcome
logistX <- logistX/1.41 + rnorm(n=100,mean=0,sd=1)/1.41
#png("Figure10_2.png", width = 6, height = 6, units = 'in', res = 300)
plot(logistX, binomY) # Dichotomous outcome against the noisy predictor
#dev.off()
binomY <- factor(round(logistY), labels=c('Truth','Lie')) # Relabel as a two-level factor
logistDF <- data.frame(logistX, logistY, binomY) # Make data frame
#png("Figure10_3.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(formula=logistX ~ binomY, data=logistDF, ylab="GSR", main=NULL)
#dev.off()
glmOut <- glm(binomY ~ logistX, data=logistDF, family=binomial()) # Logistic regression
summary(glmOut)
mean(residuals(glmOut))
hist(residuals(glmOut))
exp(coef(glmOut)) # Convert log odds to odds
exp(confint(glmOut)) # Look at confidence intervals around log-odds
anova(glmOut, test="Chisq") # Compare null model to one predictor model
#png("Figure10_4.png", width = 6, height = 6, units = 'in', res = 300)
par(mfrow=c(1,2)) # par() configures the plot area
plot(binomY, predict(glmOut),ylim=c(-4,4)) # Compare with earlier plot
plot(binomY, logistX,ylim=c(-4,4))
#dev.off()
# Logistic regression with a real data example
install.packages("car") # Package with the Chile voting data
library(car)
data(Chile)
ChileY <- Chile[Chile$vote == "Y",] # Grab the Yes votes
ChileN <- Chile[Chile$vote == "N",] # Grab the No votes
ChileYN <- rbind(ChileY,ChileN) # Make a new dataset with those
ChileYN <- ChileYN[complete.cases(ChileYN),] # Get rid of missing data
ChileYN$vote <- factor(ChileYN$vote,levels=c("N","Y")) # Fix the factor
# dim(ChileYN)
# table(ChileYN$vote)
#png("Figure10_5.png", width = 6, height = 6, units = 'in', res = 300)
par(mfrow=c(1,2))
boxplot(age ~ vote, data=ChileYN, main=NULL)
boxplot(income ~ vote, data=ChileYN)
#dev.off()
chOut <- glm(formula = vote ~ age + income, family = binomial(), data = ChileYN) # Two-predictor logistic model
# Intercept only model
#chOut <- glm(formula = vote ~ 1, family = binomial(), data = ChileYN)
summary(chOut)
exp(coef(chOut)) # Convert log odds to odds
exp(confint(chOut)) # Look at confidence intervals
anova(chOut, test="Chisq") # Compare null model to predictor models
# ChileYN$vote <- ChileYN$vote/10
install.packages("BaylorEdPsych") # Package with PseudoR2()
library(BaylorEdPsych)
PseudoR2(chOut) # Pseudo R-squared measures of model fit
# hist(predict(chOut,type="response"))
table(round(predict(chOut,type="response")),ChileYN$vote) # Predicted vs. actual votes
# Now do it the Bayesian way
install.packages("MCMCpack") # Download MCMCpack package
library(MCMCpack) # Load the package
ChileYN$vote <- as.numeric(ChileYN$vote) - 1 # Adjust the outcome variable
bayesLogitOut <- MCMClogit(formula = vote ~ age + income, data = ChileYN)
summary(bayesLogitOut) # Summarize the results
#png("Figure10_6.png", width = 6, height = 6, units = 'in', res = 300)
plot(bayesLogitOut) # Trace and density plots for each coefficient
#dev.off()
exp(mean(bayesLogitOut[,"age"])) # Mean age coefficient as an odds ratio
exp(quantile(bayesLogitOut[,"age"],c(0.025)))
exp(quantile(bayesLogitOut[,"age"],c(0.975)))
ageLogOdds <- as.matrix(bayesLogitOut[,"age"]) # Extract posterior draws for age
ageOdds <- apply(ageLogOdds,1,exp) # Convert each draw from log odds to odds
#png("Figure10_7.png", width = 6, height = 6, units = 'in', res = 300)
hist(ageOdds, main=NULL)
abline(v=quantile(ageOdds,c(0.025)),col="black")
abline(v=quantile(ageOdds,c(0.975)),col="black")
#dev.off()
#####################################
# Chapter 11
#png("Figure11_1.png", width = 6, height = 6, units = 'in', res = 300)
boxplot(weight ~ Time, data=ChickWeight) # Chick weights at each measurement time
#dev.off()
# t-test
ch16index <- ChickWeight$Time == 16 # Chicks measured at time 16
ch18index <- ChickWeight$Time == 18 # Chicks measured at time 18
bothChicks <- ChickWeight[ch16index | ch18index,] # Both sets together
time16weight <- bothChicks[bothChicks$Time == 16,"weight"] # Grab the weights for t=16
time18weight <- bothChicks[bothChicks$Time == 18,"weight"] # Grab the weights for t=18
cor(time16weight,time18weight) # Are they correlated?
mean(time16weight)
mean(time18weight)
t.test(time18weight,time16weight,paired = FALSE) # Independent groups t-test
BESTmcmc(time18weight,time16weight)# Run the Bayesian equivalent
t.test(time18weight,time16weight,paired = TRUE) # Dependent groups t-test
weightDiffs <- time18weight - time16weight # Make difference scores
t.test(weightDiffs) # Run a one sample t-test on difference scores
BESTmcmc(weightDiffs) # Run the Bayesian equivalent on difference scores
# ANOVA within
chwBal <- ChickWeight # Copy the dataset
chwBal$TimeFact <- as.factor(chwBal$Time) # Convert Time to a factor
# Keep only the chicks measured at all 12 time points so the design is
# balanced. The vector is named completeChicks rather than "list" so it
# does not shadow the base list() function.
completeChicks <- rowSums(table(chwBal$Chick,chwBal$TimeFact))==12 # Flag complete chicks
completeChicks <- completeChicks[completeChicks==TRUE] # Keep only those with 12 observations
completeChicks <- as.numeric(names(completeChicks)) # Extract the chick identifiers
chwBal <- chwBal[chwBal$Chick %in% completeChicks,] # Match against the data
# table(chwBal$Chick,chwBal$TimeFact) # Check results
summary(aov(weight ~ TimeFact + Error(Chick), data=chwBal)) # Within-subjects ANOVA
# Code for the Sidebar
# Install first, then attach (original had these two lines reversed,
# which would fail on a machine without ez already installed)
install.packages("ez")
library("ez")
ezANOVA(data=chwBal, dv=.(weight), within=.(TimeFact), wid=.(Chick), detailed=TRUE)
# Treating time as a random variable
#summary(aov(weight ~ Time + Error(Chick), data=chwBal))
#summary(lme(weight ~ TimeFact, data=chwBal, random = ~1 | Chick))
# Demonstration: two independent series that each carry a linear trend
# appear strongly correlated even though their noise is unrelated.
set.seed(1234) # Control random numbers so results are reproducible
tslen <- 180 # About half a year of daily points
ex1 <- rnorm(n=tslen,mean=0,sd=10) # Make a random variable
tex1 <- ex1 + seq(from=1, to=tslen, by=1) # Add the fake upward trend
#png("Figure11_2.png", width = 6, height = 6, units = 'in', res = 300)
plot.ts(tex1) # Plot the time series with a connected line
#dev.off()
ex2 <- rnorm(n=tslen,mean=0,sd=10) # Make another random variable
tex2 <- ex2 + seq(from=1, to=tslen, by=1) # Add the fake upward trend
cor(ex1, ex2) # Correlation between the two random variables (near zero)
cor(tex1, tex2) # Correlation between the two time series (spuriously large)
#png("Figure11_3.png", width = 6, height = 6, units = 'in', res = 300)
plot(tex1, tex2)
#dev.off()
# Build a third series with noise, a linear trend, and a sine "season"
ex3 <- rnorm(n=tslen,mean=0,sd=10)
tex3 <- ex3 + seq(from=1, to=tslen, by=1) # Add the fake upward trend
tex3 <- tex3 + sin(seq(from=0,to=36,length.out=tslen))*20
#png("Figure11_4.png", width = 6, height = 6, units = 'in', res = 300)
plot.ts(tex3)
#dev.off()
# Decompose into trend + seasonal + random, treating 30 points as one cycle
decOut <- decompose(ts(tex3,frequency=30))
#png("Figure11_5.png", width = 6, height = 6, units = 'in', res = 300)
plot(decOut)
#dev.off()
# Fixed: use TRUE rather than the shorthand T, which can be reassigned
mean(decOut$trend,na.rm=TRUE) # Moving-average trend has NAs at both ends
mean(decOut$seasonal) # Seasonal component is centered, so mean is ~0
mean(decOut$random,na.rm=TRUE)
cor(ex3, decOut$random, use="complete.obs")
# Autocorrelation demonstrations: pure noise vs. trended/seasonal series.
set.seed(1234)
tslen <- 180
# Re-seeding means this ex1 reproduces the earlier ex1 exactly
ex1 <- rnorm(n=tslen,mean=0,sd=10) # Make a random variable
#png("Figure11_6.png", width = 6, height = 6, units = 'in', res = 300)
acf(ex1, main=NULL) # White noise: no meaningful autocorrelation at any lag
#dev.off()
tex1 <- ex1 + seq(from=1, to=tslen, by=1) # Add the fake upward trend
#png("Figure11_7.png", width = 6, height = 6, units = 'in', res = 300)
acf(tex1, main=NULL) # A trend yields large, slowly decaying autocorrelations
#dev.off()
# NOTE(review): the seed was reset above, so this ex3 is drawn at a
# different position in the random stream than the ex3 used to build
# decOut earlier -- acf(tex3) below does not describe the exact series
# that decOut decomposed. Confirm against the book's figures.
ex3 <- rnorm(n=tslen,mean=0,sd=10)
tex3 <- ex3 + seq(from=1, to=tslen, by=1) # Add the fake upward trend
tex3 <- tex3 + sin(seq(from=0,to=36,length.out=tslen))*20
acf(tex3)
# decOut was created earlier with decompose(); na.pass tolerates the NA
# values at the ends of the moving-average trend component
acf(decOut$trend,na.action=na.pass)
#png("Figure11_8.png", width = 6, height = 6, units = 'in', res = 300)
acf(decOut$seasonal, main=NULL) # Periodic spikes from the seasonal cycle
#dev.off()
#png("Figure11_9.png", width = 6, height = 6, units = 'in', res = 300)
acf(decOut$random,na.action=na.pass, main=NULL)
#dev.off()
install.packages("tseries") # Provides the augmented Dickey-Fuller test
library("tseries")
decComplete <- decOut$random[complete.cases(decOut$random)] # Drop the NA ends
adf.test(decComplete) # Shows significant, so it is stationary
# Plot the four European stock index series (DAX, SMI, CAC, FTSE)
#png("Figure11_10.png", width = 6, height = 6, units = 'in', res = 300)
plot(EuStockMarkets, main=NULL)
#dev.off()
#png("Figure11_11.png", width = 6, height = 6, units = 'in', res = 300)
plot(diff(EuStockMarkets), main=NULL) # First differences of each series
#dev.off()
# The following code examines change point analysis
install.packages("changepoint")
library(changepoint)
# Use changepoint analysis to locate the position of a mean change
DAX <- EuStockMarkets[,1] # First column is the German DAX index
DAXcp <- cpt.mean(DAX) # Find a change point in the mean of the series
DAXcp # Print the fitted changepoint object
#png("Figure11_12.png", width = 6, height = 6, units = 'in', res = 300)
plot(DAXcp,cpt.col="grey",cpt.width=5)
#dev.off()
cpt.var(diff(EuStockMarkets[,"DAX"])) # Examine the change in variance
# Change to a simple output data structure to retrieve the confidence value
DAXcp <- cpt.mean(DAX,class=FALSE) # class=FALSE returns a plain named vector
DAXcp["conf.value"]
# Now difference the DAX series and look for a change in variance
dEUstocks <- diff(EuStockMarkets)
plot(dEUstocks)
dDAX <- dEUstocks[,1] # Differenced DAX series
dDAXcp <- cpt.var(dDAX) # Locate a change point in the variance
plot(dDAXcp,cpt.col="grey",cpt.width=5)
dDAXcp
# Bayesian change point analysis of the DAX series
install.packages("bcp")
library(bcp)
bcpDAX <- bcp(as.vector(DAX)) # Pass a plain vector rather than the ts object
#png("Figure11_13.png", width = 8, height = 6, units = 'in', res = 300)
# NOTE(review): unit() here comes from the grid package -- confirm that
# grid is attached when this runs (bcp's plot method appears to rely on it)
plot(bcpDAX,outer.margins = list(left = unit(4,"lines"), bottom = unit(3, "lines"), right = unit(3, "lines"), top = unit(2,"lines")), main=NULL)
#dev.off()
#png("Figure11_14.png", width = 6, height = 6, units = 'in', res = 300)
plot(bcpDAX$posterior.prob>.95) # Flag points whose posterior probability exceeds .95
#dev.off()
# Sidebar 11.2
# Run a model with p=1, d=0, and q=1; hold out the last ten values
tsFit <- arima(LakeHuron[1:88], order=c(1,0,1)) # Fit the model
predict(tsFit,n.ahead=10) # Show the next ten predicted values
LakeHuron[89:98] # Compare with the actual values
# End of Sidebar 11.2
# Homework starter code
grp1 <- rnorm(100) # 100 standard normal observations
grp2 <- grp1 + runif(100,max=0.1) # Same scores plus a small uniform shift in [0, 0.1]
t.test(grp1,grp2,paired=FALSE) # Independent-groups test ignores the pairing
t.test(grp1,grp2,paired=TRUE) # Paired test uses the per-case differences
#####################################
# Chapter 12
str(iris) # Reveal the data structure for the iris dataset
irisN <- subset(iris,select=-Species) # Keep only the four numeric columns
str(irisN)
round(cor(irisN),digits=3) # Show correlation matrix for the iris data
install.packages("psych")
library(psych)
irisNout <- principal(irisN) # psych's principal components; one component by default
irisNout
irisNout <- principal(irisN,nfactors=2) # Extract two components instead
irisNout
summary(irisN)
irisNS <- scale(irisN) # standardize each variable
flowerSize <- (irisNS[,1]+ irisNS[,3]+ irisNS[,4])/3 # All except Sepal.Width
mean(flowerSize) # Near zero because the inputs are standardized
sd(flowerSize)
# Sidebar 12.1 code
# Component scores from the two-component solution computed just above
facScore1 <- irisNout$scores[,"RC1"]
facScore2 <- irisNout$scores[,"RC2"]
length(facScore1)
mean(facScore1) # Component scores are standardized: mean ~0
sd(facScore1)
cor(facScore1,flowerSize) # Compare component scores with the simple composite
cor(facScore2,flowerSize)
# Cronbach's alpha; check.keys=TRUE reverses items that correlate negatively
alpha(irisN,check.keys = TRUE)
irisNout <- principal(irisN,nfactors=2) # We just ran this earlier in the chapter
#png("Figure12_1.png", width = 5, height = 9, units = 'in', res = 300)
plot(irisNout) # Plot variable loadings on the two components
#dev.off()
# NOTE(review): presumably reports the rotation method applied (varimax is
# psych's default) -- confirm this field name for the installed psych version
irisNout$rotation
irisNout <- principal(irisN,nfactors=2, rotate="none") # Unrotated solution for contrast
#png("Figure12_2.png", width = 5, height = 9, units = 'in', res = 300)
plot(irisNout)
#dev.off()
#####################################
# Appendix B
# Build a small data frame by hand from parallel vectors
caseLabel <- c("A","B","C","D","E") # One label per family member
caseLabel
age <- c(43, 42, 12, 8, 5)
gender <- c("Male","Female","Female","Male","Female")
weight <- c(188,136,83,61,44)
myFamily <- data.frame(caseLabel, age, gender, weight)
str(myFamily) # Show the structure: variable types and sample values
summary(myFamily)
myFamily$age # Access one column with the $ operator
age <- c(age, 11) # Extends the standalone vector; myFamily is unaffected
# NOTE(review): the next line errors ("replacement has 6 rows, data has 5")
# because one column of a data frame cannot be longer than the others --
# presumably the book uses this to demonstrate that error; confirm.
myFamily$age<-c(myFamily$age, 11)
#####################################
# Appendix C
# Fixed: "datasets" is a base package shipped with every R installation;
# it cannot be installed from CRAN, so the install call is unnecessary
# (and produces a warning). library(datasets) alone is sufficient.
#install.packages("datasets")
install.packages("dplyr")
library(datasets)
library(dplyr)
data() # List the datasets available in attached packages
# Fixed: tbl_df() is deprecated in dplyr; as_tibble() (re-exported by
# dplyr from the tibble package) is the supported equivalent
euStocks <- as_tibble(data.frame(EuStockMarkets))
euStocks
arrange(euStocks, DAX) # Sorted copy is printed; euStocks itself unchanged
euStocks
euStocks <- arrange(euStocks, DAX) # Assign the result to make the sort stick
euStocks
euStocks <- arrange(euStocks, desc(DAX), desc(SMI)) # Descending sort on two keys
euStocks
euStocks <- select(euStocks, DAX, FTSE) # Keep only two columns
euStocks
euStocks <- mutate(euStocks, avindex = (DAX + FTSE)/2) # Add a derived column
euStocks
filter(euStocks, avindex < mean(avindex)) # Rows below the average index value