Spatially Correlated Errors, Spatial Block Bootstrapping, Bag of Little Bootstrap (BLB)

Post date: Feb 14, 2015 7:08:44 PM

# Title: Spatial Bootstrapping (Spatial Block Bootstrapping, Bag of Little Bootstrap)

# Author: Haoying Wang, Cornell University

# Date: Feb 2015

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Settings

# Load packages

rm(list=ls()) # clean the workspace

packagelist <- c("rgdal", "foreign","RColorBrewer","plyr","raster","ncdf","rgeos",

"maptools","rgeos","rasterVis","gstat","caret")

newpackages <- packagelist[!(packagelist %in% installed.packages()[,"Package"])]

if(length(newpackages)) install.packages(newpackages)

lapply(packagelist, function(i) require(i, character.only=TRUE))

remove(packagelist, newpackages)

# Useful library

library(sp) # classes for spatial data

library(raster) # grids, rasters

library(rasterVis) # raster visualisation

library(maptools)

library(rgeos)

library(spdep)

library(gstat)

library(nlme) # for generalized least squares

library(caret) # for spliting data frame

# Key settings

dir <- list()

# Haoying's Dropbox, consider customizing to your computer

dir$root <- paste(rev(rev(strsplit(getwd(),"/")[[1]])[-1]),collapse="/")

dir$data <- paste(dir$root,"/R_code",sep="")

list.files(dir$data)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 0: Macro Parameter Settings

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

size_lon <- 1000 # longitude length of the population

size_lat <- 1000 # latitude length of the population

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 1: general error terms with spatial correlation - one sample

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# a raster without spatial correlation

# ptm_e1 <- proc.time() # start the clock

# e1 <- matrix(rnorm(size_lon*size_lat, mean = 0, sd = 1),size_lon,size_lat)

# Turn the matrix into a raster

# e1_rast <- raster(e1)

# Give it lat/lon coords for 0-1°E, 0-1°S

# extent(e1_rast) <- c(0,1,0,1) # use same extent for all layers

# plot(e1_rast)

# proc.time() - ptm_e1 # Stop the clock

# a raster with spatial correlation

ptm_e <- proc.time() # start the clock

ee <- expand.grid(1:200, 1:200)

names(ee) <- c('ee1','ee2')

# define the gstat object (spatial model)

g.dummy <- gstat(formula=z~1, locations=~ee1+ee2, dummy=T,

beta=1, model=vgm(psill=1,model='Exp',range=50), nmax=20)

# make four simulations based on the gstat object

e <- predict(g.dummy, newdata=ee, nsim=1)

# show one realisation

gridded(e) = ~ee1+ee2

spplot(e)

e_rast <- (raster(e) - mean(as.matrix(raster(e))))/sd(as.matrix(raster(e))) # standardize sample

extent(e_rast) <- c(0,1,0,1) # use same extent for all layers

plot(e_rast)

proc.time() - ptm_e # Stop the clock

# change the resoultion of the raster to size_lon*size_lat

# note: 'bilinear' - values are locally interpolated (using the resample function)

# note: for disaggregate() function, 'fact' has to be an integer

e_new <- disaggregate(e_rast, fact=c(size_lon/200,size_lat/200), method='bilinear')

# compare two different resolutions

par(mfcol = c(1, 2))

plot(e_rast, main = "Resolution - Low")

plot(e_new, main = "Resolution - High")

par(mfcol = c(1, 1))

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 2: generate population

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# population size

pop_size <- 300

ptm_e <- proc.time() # start the clock

ee <- expand.grid(1:200, 1:200)

names(ee) <- c('ee1','ee2')

# define the gstat object (spatial model)

g.dummy <- gstat(formula=z~1, locations=~ee1+ee2, dummy=T,

beta=1, model=vgm(psill=1,model='Exp',range=50), nmax=20)

# make four simulations based on the gstat object

e <- predict(g.dummy, newdata=ee, nsim=pop_size)

# show one realisation

gridded(e) = ~ee1+ee2

spplot(e[1:4], main = "Resolution - 200 * 200")

# standardize the raster stack

e_rast <- brick(e)

for (i in 1:pop_size)

{

e_rast[[i]] <- (e_rast[[i]] - colMeans(as.matrix(e_rast[[i]])))/sd(as.matrix(e_rast[[i]]))

print(i)

}

extent(e_rast) <- c(0,1,0,1) # use same extent for all layers

plot(e_rast[[1:4]], main = "Resolution - 200 * 200")

proc.time() - ptm_e # Stop the clock

# change the resoultion of the raster to size_lon*size_lat

# note: 'bilinear' - values are locally interpolated (using the resample function)

# note: for disaggregate() function, 'fact' has to be an integer

e_new <- disaggregate(e_rast, fact=c(size_lon/200,size_lat/200), method='bilinear')

plot(e_new[[1:4]], main = "Resolution - 1000 * 1000")

# save the raster brick

# writeRaster(e_new[[1:50]], filename="e_rast_1000res1.tif", format="GTiff", overwrite=TRUE)

# writeRaster(e_new[[51:100]], filename="e_rast_1000res2.tif", format="GTiff", overwrite=TRUE)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 3: general dependent & independent variables

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# generate independent variable

x <- matrix(rnorm(size_lon*size_lat, mean = 1, sd = 1),size_lon,size_lat)

# Turn the matrix into a raster

x_rast <- raster(x)

extent(x_rast) <- c(0,1,0,1) # use same extent for all layers

# plot(x_rast,main = "X - resolution - 200 * 200")

# generate intercept

intercept <- matrix(1,size_lon,size_lat)

# Turn the matrix into a raster

intercept_rast <- raster(intercept)

extent(intercept_rast) <- c(0,1,0,1) # use same extent for all layers

# generate dependent variable

# initialize the stack, set values to NA

y_rast <- e_new

y_rast <- setValues(y_rast, NA)

for (i in 1:pop_size)

{

y_rast[[i]] <- sum(stack(intercept_rast, x_rast, e_new[[i]]))

print(i)

}

plot(y_rast[[1:4]], main = "Y - resolution - 1000 * 1000")

# assmeble data - with spatial correlation

data_rast <- stack(y_rast,x_rast)

names(data_rast[[301]]) <- c("x_rast") # name the independent variable

data <- as.data.frame(data_rast)

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 4: simple linear regression

# Note: tried to run generalized least squares with an 1000*1000 observations,

# but unsuccessful!

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# data with spatial correlation

ptm_glm <- proc.time() # start the clock

b_ols <- matrix(,pop_size,1)

for (i in 1:pop_size)

{

fit <- lm(data[,i]~x_rast,data=data) # Ordinary Least Squares

b_ols[i] <- coef(fit)[2] # extract estimate on X coefficient

print(i)

}

proc.time() - ptm_glm # Stop the clock

plot(density(b_ols),col="blue",lwd=2,xlim=c(0.995,1.005), main="Ordinary Least Squares")

abline(v = 1, col = "black",lty=2) # true beta

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 5: spatial block bootstrapping

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# parameter setting for bootstrapping

b <- 90000 # block size for bootstrapping

b_lon <- ceiling(sqrt(b)) # longitude length of block

b_lat <- ceiling(sqrt(b)) # latitude length of block

tolerance <- 0.2; # tolerance for percentage of 'NA' values in selected blocks

draws <- 500 # number of draws for block bootstrapping

map = stack(data_rast[[sample(1:pop_size,1,replace = TRUE)]],data_rast[[pop_size+1]])

nx <- nrow(map)

ny <- ncol(map)

b_boot1 <- matrix(,draws,1)

ptm_block1 <- proc.time() # start the clock

for (i in 1:draws)

{

na_counter <- 1 # a counter for 'NA' values in selected block

while(na_counter > tolerance)

{

sub_x <- sample(1:nx-b_lon,1,replace = TRUE) # determine the starting point of block - X dimension

sub_y <- sample(1:ny-b_lat,1,replace = TRUE) # determine the starting point of block - Y dimension

multi_block <- matrix(,b_lon*b_lat,nlayers(map)) # initialize the outcome matrix

multi_block <- getValuesBlock(map,sub_x,b_lon,sub_y,b_lat,lyr=1:nlayers(map))

na_mat <- as.vector(1:nlayers(map))

for(j in 1:nlayers(map))

{

na_mat[j] <- sum(is.na(multi_block[,j]))/length(multi_block[,j])

}

na_counter <- min(na_mat)

if(na_counter <= tolerance)

{ break

cat('\n','Percentage of NULL cell values:',na_counter*100,',no block selected!')

}

}

data_block <- as.data.frame(multi_block)

names(data_block) <- c("y_rast","x_rast")

fit <- lm(y_rast~x_rast,data=data_block) # Ordinary Least Squares

b_boot1[i] <- coef(fit)[2] # extract estimate on X coefficient

print(i)

}

proc.time() - ptm_block1 # Stop the clock

plot(density(b_boot1),col="blue",lwd=2,xlim=c(0.99,1.01),ylim=c(0,800),main="Ordinary Least Squares")

lines(density(b_ols),col="red",lwd=2)

legend("topright", inset=.05, title="Spatial Block Bootstrapping",

c("Estimated Distribution","True Distribution"),col=c("blue","red"),lwd=2,cex = 0.8)

abline(v = 1, col = "black",lty=2) # true beta

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# Step 6: spatial bootstrapping - little bag of bootstraps

# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# parameter setting for bootstrapping

# note: total population size = size_lon*size_lat

b_num <- 10 # number of bags

draws <- 200 # number of draws within each bag

map = stack(data_rast[[sample(1:pop_size,1,replace = TRUE)]],data_rast[[pop_size+1]])

mapdata <- as.data.frame(map)

names(mapdata) <- c("y","x")

# split the data into 'b_num' random bags

bags <- createFolds(mapdata$y, k = b_num, list = TRUE, returnTrain = FALSE)

b_boot2 <- matrix(,draws,b_num)

ptm_block2 <- proc.time() # start the clock

for (i in 1:b_num)

{

bag <- mapdata[bags[[i]],c("y","x")]

for (j in 1:draws)

{

# resample the entire population from the i-th bag

bag_data <- bag[sample(nrow(bag), nrow(mapdata), replace = TRUE),]

fit <- lm(y~x,data=bag_data) # Ordinary Least Squares

b_boot2[j,i] <- coef(fit)[2] # extract estimate on X coefficient

print(c(i,j)) # tracking both loops

}

}

proc.time() - ptm_block2 # Stop the clock

# plotting for all bags

par(mfrow=c(3,4))

for (p in 1:b_num)

{

plot(density(b_boot2[,p]),col="blue",lwd=2,xlim=c(0.99,1.01),ylim=c(0,750),

main="Little Bag of Bootstrap")

lines(density(b_ols),col="red",lwd=2)

legend("topright", inset=.05, title="Spatial Block Bootstrapping",

c("Estimated Distribution","True Distribution"),col=c("blue","red"),lwd=2,cex = 0.2)

abline(v = 1, col = "black",lty=2) # true beta

}

par(mfrow=c(1,1))