Spatially Correlated Errors, Spatial Block Bootstrapping, Bag of Little Bootstrap (BLB)
Post date: Feb 14, 2015 7:08:44 PM
# Title: Spatial Bootstrapping (Spatial Block Bootstrapping, Bag of Little Bootstrap)
# Author: Haoying Wang, Cornell University
# Date: Feb 2015
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Settings
# Load packages
rm(list=ls()) # clean the workspace
packagelist <- c("rgdal", "foreign","RColorBrewer","plyr","raster","ncdf","rgeos",
"maptools","rgeos","rasterVis","gstat","caret")
newpackages <- packagelist[!(packagelist %in% installed.packages()[,"Package"])]
if(length(newpackages)) install.packages(newpackages)
lapply(packagelist, function(i) require(i, character.only=TRUE))
remove(packagelist, newpackages)
# Useful library
library(sp) # classes for spatial data
library(raster) # grids, rasters
library(rasterVis) # raster visualisation
library(maptools)
library(rgeos)
library(spdep)
library(gstat)
library(nlme) # for generalized least squares
library(caret) # for spliting data frame
# Key settings
dir <- list()
# Haoying's Dropbox, consider customizing to your computer
dir$root <- paste(rev(rev(strsplit(getwd(),"/")[[1]])[-1]),collapse="/")
dir$data <- paste(dir$root,"/R_code",sep="")
list.files(dir$data)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 0: Macro Parameter Settings
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
size_lon <- 1000 # longitude length of the population
size_lat <- 1000 # latitude length of the population
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 1: general error terms with spatial correlation - one sample
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# a raster without spatial correlation
# ptm_e1 <- proc.time() # start the clock
# e1 <- matrix(rnorm(size_lon*size_lat, mean = 0, sd = 1),size_lon,size_lat)
# Turn the matrix into a raster
# e1_rast <- raster(e1)
# Give it lat/lon coords for 0-1°E, 0-1°S
# extent(e1_rast) <- c(0,1,0,1) # use same extent for all layers
# plot(e1_rast)
# proc.time() - ptm_e1 # Stop the clock
# a raster with spatial correlation
ptm_e <- proc.time() # start the clock
ee <- expand.grid(1:200, 1:200)
names(ee) <- c('ee1','ee2')
# define the gstat object (spatial model)
g.dummy <- gstat(formula=z~1, locations=~ee1+ee2, dummy=T,
beta=1, model=vgm(psill=1,model='Exp',range=50), nmax=20)
# make four simulations based on the gstat object
e <- predict(g.dummy, newdata=ee, nsim=1)
# show one realisation
gridded(e) = ~ee1+ee2
spplot(e)
e_rast <- (raster(e) - mean(as.matrix(raster(e))))/sd(as.matrix(raster(e))) # standardize sample
extent(e_rast) <- c(0,1,0,1) # use same extent for all layers
plot(e_rast)
proc.time() - ptm_e # Stop the clock
# change the resoultion of the raster to size_lon*size_lat
# note: 'bilinear' - values are locally interpolated (using the resample function)
# note: for disaggregate() function, 'fact' has to be an integer
e_new <- disaggregate(e_rast, fact=c(size_lon/200,size_lat/200), method='bilinear')
# compare two different resolutions
par(mfcol = c(1, 2))
plot(e_rast, main = "Resolution - Low")
plot(e_new, main = "Resolution - High")
par(mfcol = c(1, 1))
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 2: generate population
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# population size
pop_size <- 300
ptm_e <- proc.time() # start the clock
ee <- expand.grid(1:200, 1:200)
names(ee) <- c('ee1','ee2')
# define the gstat object (spatial model)
g.dummy <- gstat(formula=z~1, locations=~ee1+ee2, dummy=T,
beta=1, model=vgm(psill=1,model='Exp',range=50), nmax=20)
# make four simulations based on the gstat object
e <- predict(g.dummy, newdata=ee, nsim=pop_size)
# show one realisation
gridded(e) = ~ee1+ee2
spplot(e[1:4], main = "Resolution - 200 * 200")
# standardize the raster stack
e_rast <- brick(e)
for (i in 1:pop_size)
{
e_rast[[i]] <- (e_rast[[i]] - colMeans(as.matrix(e_rast[[i]])))/sd(as.matrix(e_rast[[i]]))
print(i)
}
extent(e_rast) <- c(0,1,0,1) # use same extent for all layers
plot(e_rast[[1:4]], main = "Resolution - 200 * 200")
proc.time() - ptm_e # Stop the clock
# change the resoultion of the raster to size_lon*size_lat
# note: 'bilinear' - values are locally interpolated (using the resample function)
# note: for disaggregate() function, 'fact' has to be an integer
e_new <- disaggregate(e_rast, fact=c(size_lon/200,size_lat/200), method='bilinear')
plot(e_new[[1:4]], main = "Resolution - 1000 * 1000")
# save the raster brick
# writeRaster(e_new[[1:50]], filename="e_rast_1000res1.tif", format="GTiff", overwrite=TRUE)
# writeRaster(e_new[[51:100]], filename="e_rast_1000res2.tif", format="GTiff", overwrite=TRUE)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 3: general dependent & independent variables
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# generate independent variable
x <- matrix(rnorm(size_lon*size_lat, mean = 1, sd = 1),size_lon,size_lat)
# Turn the matrix into a raster
x_rast <- raster(x)
extent(x_rast) <- c(0,1,0,1) # use same extent for all layers
# plot(x_rast,main = "X - resolution - 200 * 200")
# generate intercept
intercept <- matrix(1,size_lon,size_lat)
# Turn the matrix into a raster
intercept_rast <- raster(intercept)
extent(intercept_rast) <- c(0,1,0,1) # use same extent for all layers
# generate dependent variable
# initialize the stack, set values to NA
y_rast <- e_new
y_rast <- setValues(y_rast, NA)
for (i in 1:pop_size)
{
y_rast[[i]] <- sum(stack(intercept_rast, x_rast, e_new[[i]]))
print(i)
}
plot(y_rast[[1:4]], main = "Y - resolution - 1000 * 1000")
# assmeble data - with spatial correlation
data_rast <- stack(y_rast,x_rast)
names(data_rast[[301]]) <- c("x_rast") # name the independent variable
data <- as.data.frame(data_rast)
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 4: simple linear regression
# Note: tried to run generalized least squares with an 1000*1000 observations,
# but unsuccessful!
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# data with spatial correlation
ptm_glm <- proc.time() # start the clock
b_ols <- matrix(,pop_size,1)
for (i in 1:pop_size)
{
fit <- lm(data[,i]~x_rast,data=data) # Ordinary Least Squares
b_ols[i] <- coef(fit)[2] # extract estimate on X coefficient
print(i)
}
proc.time() - ptm_glm # Stop the clock
plot(density(b_ols),col="blue",lwd=2,xlim=c(0.995,1.005), main="Ordinary Least Squares")
abline(v = 1, col = "black",lty=2) # true beta
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 5: spatial block bootstrapping
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# parameter setting for bootstrapping
b <- 90000 # block size for bootstrapping
b_lon <- ceiling(sqrt(b)) # longitude length of block
b_lat <- ceiling(sqrt(b)) # latitude length of block
tolerance <- 0.2; # tolerance for percentage of 'NA' values in selected blocks
draws <- 500 # number of draws for block bootstrapping
map = stack(data_rast[[sample(1:pop_size,1,replace = TRUE)]],data_rast[[pop_size+1]])
nx <- nrow(map)
ny <- ncol(map)
b_boot1 <- matrix(,draws,1)
ptm_block1 <- proc.time() # start the clock
for (i in 1:draws)
{
na_counter <- 1 # a counter for 'NA' values in selected block
while(na_counter > tolerance)
{
sub_x <- sample(1:nx-b_lon,1,replace = TRUE) # determine the starting point of block - X dimension
sub_y <- sample(1:ny-b_lat,1,replace = TRUE) # determine the starting point of block - Y dimension
multi_block <- matrix(,b_lon*b_lat,nlayers(map)) # initialize the outcome matrix
multi_block <- getValuesBlock(map,sub_x,b_lon,sub_y,b_lat,lyr=1:nlayers(map))
na_mat <- as.vector(1:nlayers(map))
for(j in 1:nlayers(map))
{
na_mat[j] <- sum(is.na(multi_block[,j]))/length(multi_block[,j])
}
na_counter <- min(na_mat)
if(na_counter <= tolerance)
{ break
cat('\n','Percentage of NULL cell values:',na_counter*100,',no block selected!')
}
}
data_block <- as.data.frame(multi_block)
names(data_block) <- c("y_rast","x_rast")
fit <- lm(y_rast~x_rast,data=data_block) # Ordinary Least Squares
b_boot1[i] <- coef(fit)[2] # extract estimate on X coefficient
print(i)
}
proc.time() - ptm_block1 # Stop the clock
plot(density(b_boot1),col="blue",lwd=2,xlim=c(0.99,1.01),ylim=c(0,800),main="Ordinary Least Squares")
lines(density(b_ols),col="red",lwd=2)
legend("topright", inset=.05, title="Spatial Block Bootstrapping",
c("Estimated Distribution","True Distribution"),col=c("blue","red"),lwd=2,cex = 0.8)
abline(v = 1, col = "black",lty=2) # true beta
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Step 6: spatial bootstrapping - little bag of bootstraps
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# parameter setting for bootstrapping
# note: total population size = size_lon*size_lat
b_num <- 10 # number of bags
draws <- 200 # number of draws within each bag
map = stack(data_rast[[sample(1:pop_size,1,replace = TRUE)]],data_rast[[pop_size+1]])
mapdata <- as.data.frame(map)
names(mapdata) <- c("y","x")
# split the data into 'b_num' random bags
bags <- createFolds(mapdata$y, k = b_num, list = TRUE, returnTrain = FALSE)
b_boot2 <- matrix(,draws,b_num)
ptm_block2 <- proc.time() # start the clock
for (i in 1:b_num)
{
bag <- mapdata[bags[[i]],c("y","x")]
for (j in 1:draws)
{
# resample the entire population from the i-th bag
bag_data <- bag[sample(nrow(bag), nrow(mapdata), replace = TRUE),]
fit <- lm(y~x,data=bag_data) # Ordinary Least Squares
b_boot2[j,i] <- coef(fit)[2] # extract estimate on X coefficient
print(c(i,j)) # tracking both loops
}
}
proc.time() - ptm_block2 # Stop the clock
# plotting for all bags
par(mfrow=c(3,4))
for (p in 1:b_num)
{
plot(density(b_boot2[,p]),col="blue",lwd=2,xlim=c(0.99,1.01),ylim=c(0,750),
main="Little Bag of Bootstrap")
lines(density(b_ols),col="red",lwd=2)
legend("topright", inset=.05, title="Spatial Block Bootstrapping",
c("Estimated Distribution","True Distribution"),col=c("blue","red"),lwd=2,cex = 0.2)
abline(v = 1, col = "black",lty=2) # true beta
}
par(mfrow=c(1,1))