Chown (2016) investigates the residual-based empirical distribution function in heteroscedastic nonparametric regression when the responses are missing at random.
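As a quick sketch of the objects the scripts below estimate (notation ours, assuming the usual heteroscedastic formulation): responses follow $Y = r(X) + s(X)\varepsilon$ with the standardized error $\varepsilon$ independent of the covariate $X$, an indicator $\delta = 1$ marks an observed response, and the estimator is the residual-based empirical distribution function over the $N$ complete cases,
$$ \hat F(t) \;=\; \frac{1}{N} \sum_{i \,:\, \delta_i = 1} \mathbf{1}\{ \hat\varepsilon_i \le t \}, \qquad \hat\varepsilon_i \;=\; \frac{Y_i - \hat r(X_i)}{\hat s(X_i)}, $$
where $\hat r$ and $\hat s$ are nonparametric (here, local cubic) estimates of the regression and scale functions.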
R code:
################################################################################
## Simulation to demonstrate the root-n consistency of F-hat via the MSE and ##
## MISE for finite samples.                                                  ##
################################################################################
## Required libraries.
require(tcltk)
## Set the correct working directory.
setwd("~/Documents/Published/heterrdf/simulations/")
## Custom functions.
## Regression function at x.
reg_x = function(x) {
return( 1 + x[1] - x[2] + 2 * exp( -sqrt( crossprod(x) ) / 2 ) )
}
## Wrapper function to compute regression for many points.
reg = function(xs) {
return( apply(xs, 1, reg_x) )
}
## Scale function at x.
scl_x = function(x) {
return( sqrt( 1 + 2 * crossprod(x) ) )
}
## Wrapper function to compute scale for many points.
scl = function(x) {
return( apply(x, 1, scl_x) )
}
## Missingness probability function at x.
PI_x = function(x) {
return( 1 / ( 1 + exp( -( x[1] + x[2] ) / 2 ) ) )
}
## Wrapper function to compute missingness probability for many points.
PI = function(x) {
return( apply(x, 1, PI_x) )
}
## Generate random missingness indicators: delta = 1 marks an observed
## response, which occurs with probability 1 - PI(x).
deltas = function(probs) {
return( ( runif( length(probs) ) > probs ) * 1 )
}
## Computes a product of cosine kernel functions.
w_cos = function(u) {
## Cosine kernel values for first variable.
k1 = pi * cos( pi * u[, 1] / 2 ) * ( abs(u[, 1]) <= 1 ) / 4
## Cosine kernel values for second variable.
k2 = pi * cos( pi * u[, 2] / 2 ) * ( abs(u[, 2]) <= 1 ) / 4
## Return product of kernel values.
return( k1 * k2 )
}
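## Optional sanity check (our addition, not part of the original script):
## each cosine kernel integrates to one over [-1, 1], so a Monte Carlo
## average of w_cos over [-1, 1]^2, scaled by the area 4, should be near 1:
## us_chk = matrix(runif(2e5, -1, 1), ncol=2)
## mean( w_cos(us_chk) ) * 4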
## Computes the matrix of the local cubic terms.
psi = function(u) {
## First variable.
u1 = u[, 1]
## Second variable.
u2 = u[, 2]
## Lexicographic ordering of polynomial terms.
p00 = 1
p01 = u2
p02 = u2 ^ 2 / 2
p03 = u2 ^ 3 / 6
p10 = u1
p11 = u1 * u2
p12 = u1 * u2 ^ 2 / 2
p20 = u1 ^ 2 / 2
p21 = u2 * u1 ^ 2 / 2
p30 = u1 ^ 3 / 6
## Return the psi matrix.
return( cbind(p00, p01, p02, p03, p10, p11, p12, p20, p21, p30) )
}
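## Note (our addition): the terms are u1^a * u2^b / (a! * b!) over all
## a + b <= 3, so the coefficient fitted to p00 is exactly the local cubic
## estimate of the function at x; P3_fit_x below therefore returns the
## first component of the coefficient vector.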
## Computes the local cubic estimate at x.
P3_fit_x = function(x, xdata, ydata, bwd) {
## Sample size.
nsamp = length(ydata)
## Transform xdata to localized covariates.
U = cbind(x[1] - xdata[, 1], x[2] - xdata[, 2]) / bwd
## Diagonal weight matrix.
W = diag( w_cos(U) )
## n-by-10 matrix of polynomial terms.
Psi = psi(U)
## 10-by-10 Gram matrix for P3 basis.
G = t(Psi) %*% W %*% Psi
## Inner products between ydata and P3.
v = t(Psi) %*% W %*% matrix(ydata, nrow=nsamp)
## Apply regularization only to small samples.
if( nsamp <= 150 ) { G = G + diag(1e-6, 10) }
## Local polynomial coefficients.
betahat = qr.solve(G, v)
## Return the first component.
return( betahat[1] )
}
## Wrapper function to compute P3_fit_x for many points.
P3_fit = function(xs, xdata, ydata, bwd) {
return( apply(xs, 1, P3_fit_x, xdata, ydata, bwd) )
}
## Computes Fhat.
Fhat = function(us, res) {
return( sapply(us, function(u, res) { return( mean( res <= u ) ) }, res) )
}
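## Quick illustration (our addition): with standard normal residuals, Fhat
## should track pnorm, e.g. Fhat(c(-1, 0, 1), rnorm(1e5)) is roughly
## c(0.159, 0.500, 0.841).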
## Computes the influence function for Fhat in the normal errors model.
bnorm = function(us, es) {
return( sapply(us, function(u, es) {
return( 1 * ( es <= u ) - pnorm(u) +
( es + u * ( es ^ 2 - 1 ) / 2 ) * dnorm(u) ) }, es) )
}
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
diff_sq_norm = function(us, res) {
return( ( Fhat(us, res) - pnorm(us) ) ^ 2 )
}
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes 100, 200, 500 and 1000.
samples = c(100, 200, 500, 1000)
## t values to check inflated MSE.
tseq = c(-3, -2, -1, 0)
## Fhat - F values.
Fhat_diff = array(NA, c(runs, length(tseq)))
## Fhat ISE values.
Fhat_ISE = rep(NA, runs)
## Fhat bias values.
Fhat_bias = array(NA, c(length(samples), length(tseq)))
## Fhat variance values.
Fhat_var = array(NA, c(length(samples), length(tseq)))
## Fhat MSE values.
Fhat_MSE = array(NA, c(length(samples), length(tseq)))
## Fhat MISE values.
Fhat_MISE = rep(NA, length(samples))
## Open a progress bar.
pb = tkProgressBar(title="Fhat Simulation", min=0, max=runs)
## Simulation loop.
for(n in samples) {
## Fill progress bar with current information.
setTkProgressBar(pb, 0, label=paste0("Sample size: ", n))
## Set initial time difference to zero.
avg_time_diff = 0
## Get index for n in samples.
nind = which( samples == n )
for(s in 1 : runs) {
## Start a computing timer.
init_time = proc.time()[3]
## Covariates.
X = matrix(runif(2 * n, -1, 1), ncol=2)
## Responses generated through model equation.
Y = reg(X) + scl(X) * rnorm(n)
## Missingness indicators.
Z = deltas( PI(X) )
## Complete cases.
X1 = X[which( Z == 1 ), ]
Y1 = Y[ which( Z == 1 ) ]
## A suitable bandwidth.
bwd_reg = 3 * ( n * log(n) ) ^ ( -1 / 7 )
## Complete case fitted values.
fits_CC = P3_fit(X1, X1, Y1, bwd_reg)
## Complete case second conditional moment estimates.
m2_fits_CC = P3_fit(X1, X1, Y1 ^ 2, bwd_reg)
## Complete case scale values.
scl_CC = sqrt( abs( m2_fits_CC - fits_CC ^ 2 ) )
## Complete case standardized residuals.
res_CC = ( Y1 - fits_CC ) / scl_CC
## Fhat pointwise differences.
Fhat_diff[s, ] = Fhat(tseq, res_CC) - pnorm(tseq)
## Fhat ISE.
Fhat_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_CC,
stop.on.error=FALSE)$value
## One step corrected estimate of mean computing time.
avg_time_diff = ( ( s - 1 ) * avg_time_diff +
proc.time()[3] - init_time ) / s
## Estimated time to finish.
ETF = round(avg_time_diff * ( runs - s ) / 3600, digits=1)
## Updated progress bar information.
pb_msg = paste0("Sample size: ", n, " (", ETF, " hour(s) to go)")
## Update progress bar.
setTkProgressBar(pb, s, label=pb_msg)
}
## Fhat inflated bias.
Fhat_bias[nind, ] = sqrt(n) * apply(Fhat_diff, 2, mean)
## Fhat inflated variance.
Fhat_var[nind, ] = n * apply(Fhat_diff, 2, var)
## Fhat inflated MSE.
Fhat_MSE[nind, ] = Fhat_bias[nind, ] ^ 2 + Fhat_var[nind, ]
## Fhat inflated MISE.
Fhat_MISE[nind] = n * mean(Fhat_ISE)
}
## Close the progress bar.
close(pb)
## Simulation results.
## Average probability of a complete case: E(delta) = E(1 - PI(X)) = 1/2
## here, which coincides with the average missingness by symmetry of PI.
avg_miss = 1 / 2
## True inflated MSE values for Fhat.
iMSE_norm = sapply(tseq, function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) / avg_miss
## Approximate inflated MISE value for Fhat.
iMISE_norm = sum( sapply(seq(-4, 4, by=0.0008), function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) ) * 0.0008 / avg_miss
## Fhat bias results.
bias_results = cbind(samples, Fhat_bias)
## Fhat variance results.
var_results = cbind(samples, Fhat_var)
## Fhat MSE results.
MSE_results = cbind(c(samples, Inf), rbind(Fhat_MSE, iMSE_norm))
## Fhat MISE results.
MISE_results = rbind(c(samples, Inf), c(Fhat_MISE, iMISE_norm))
## Display simulation summary tables.
round(bias_results, digits=3)
round(var_results, digits=3)
round(MSE_results, digits=3)
round(MISE_results, digits=3)
## Save results to a file.
save(bias_results, var_results, MSE_results, MISE_results,
file="heterrdf_results.RData")
Python code:
###############################################################################
## A Python simulation to demonstrate the root-n consistency of F-hat via    ##
## the MSE and IMSE for finite samples.                                      ##
###############################################################################
## Required libraries.
import numpy as np
import scipy as sp
## Explicitly load the SciPy submodules used below; a bare 'import scipy'
## does not guarantee they are available on older SciPy versions.
import scipy.integrate
import scipy.linalg
from scipy.stats import uniform
from scipy.stats import norm
from progress.bar import Bar
from tabulate import tabulate
## Progress bar class.
class FancyBar(Bar):
    suffix = '%(percent)d%% - %(ETF).1fh to go.'
    @property
    def ETF(self):
        return self.eta / 3600.0
## Custom functions.
## The regression function at x.
def reg_x(x):
    return( 1.0 + x[0] - x[1]
            + 2.0 * np.exp( -np.sqrt( np.dot(x, x) ) / 2.0 ) )
## Wrapper function to compute regression for many points.
def reg(xs):
    ## Regression values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the regression value.
    for i in range(xs.shape[0]):
        vals[i] = reg_x(xs[i, :])
    ## Return the regression values.
    return(vals)
## Scale function at x.
def scl_x(x):
    return( np.sqrt( 1.0 + 2.0 * np.dot(x, x) ) )
## Wrapper function to compute scale for many points.
def scl(xs):
    ## Scale values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the scale value.
    for i in range(xs.shape[0]):
        vals[i] = scl_x(xs[i, :])
    ## Return the scale values.
    return(vals)
## Missingness probability function at x.
def PI_x(x):
    return( 1.0 / ( 1.0 + np.exp( -( x[0] + x[1] ) / 2.0 ) ) )
## Wrapper function to compute missingness probability for many points.
def PI(xs):
    ## Missingness probability values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the missingness probability value.
    for i in range(xs.shape[0]):
        vals[i] = PI_x(xs[i, :])
    ## Return the values.
    return(vals)
## Generate random missingness indicators: delta = 1 marks an observed
## response, which occurs with probability 1 - PI(x).
def deltas(probs):
    return( 1 * ( uniform.rvs(size=probs.size) > probs ) )
## Computes a product of cosine kernel functions.
def w_cos(u):
    ## Cosine kernel values for first variable.
    k1 = ( np.pi * np.cos( np.pi * u[:, 0] / 2.0 )
           * ( np.abs(u[:, 0]) <= 1 ) / 4.0 )
    ## Cosine kernel values for second variable.
    k2 = ( np.pi * np.cos( np.pi * u[:, 1] / 2.0 )
           * ( np.abs(u[:, 1]) <= 1 ) / 4.0 )
    ## Return product of kernel values.
    return( k1 * k2 )
## Computes the matrix of local cubic terms.
def psi(u):
    ## First variable.
    u1 = u[:, 0]
    ## Second variable.
    u2 = u[:, 1]
    ## Lexicographic ordering of polynomial terms.
    p00 = np.ones(u.shape[0])
    p01 = u2
    p02 = u2 ** 2.0 / 2.0
    p03 = u2 ** 3.0 / 6.0
    p10 = u1
    p11 = u1 * u2
    p12 = u1 * u2 ** 2.0 / 2.0
    p20 = u1 ** 2.0 / 2.0
    p21 = u2 * u1 ** 2.0 / 2.0
    p30 = u1 ** 3.0 / 6.0
    ## Return the psi matrix.
    return( np.column_stack((p00, p01, p02, p03, p10, p11, p12,
                             p20, p21, p30)) )
## Computes the local cubic estimate at x.
def P3_fit_x(x, xdata, ydata, bwd):
    ## Sample size.
    nsamp = ydata.size
    ## Transform xdata to localized covariates.
    U = np.column_stack( (x[0] - xdata[:, 0], x[1] - xdata[:, 1]) ) / bwd
    ## Diagonal weight matrix.
    W = np.diag( w_cos(U) )
    ## n-by-10 matrix of polynomial terms.
    Psi = psi(U)
    ## 10-by-10 Gram matrix for weighted P3 basis.
    G = np.matmul(np.matmul(Psi.T, W), Psi)
    ## Weighted inner products between ydata and P3.
    v = np.matmul(np.matmul(Psi.T, W), ydata.reshape( (nsamp, 1) ))
    ## Apply regularization to small samples.
    if nsamp <= 150:
        G = G + np.diag(np.ones(10) / 1.0e6)
    ## Local polynomial coefficients.
    betahat = sp.linalg.solve(G, v, assume_a='pos')
    ## Return the first component.
    return( betahat[0][0] )
## Wrapper function to compute local cubic estimates for many points.
def P3_fit(xs, xdata, ydata, bwd):
    ## Estimates.
    est = np.empty(xs.shape[0])
    ## For each value of xs calculate the regression estimate.
    for i in range(xs.shape[0]):
        est[i] = P3_fit_x(xs[i, :], xdata, ydata, bwd)
    ## Return the estimates.
    return(est)
## Computes Fhat.
def Fhat(us, res):
    ## Estimates.
    est = np.empty(us.size)
    ## Compute Fhat for each u value in us.
    for i in range(us.size):
        est[i] = np.mean( res <= us[i] )
    ## Return the estimates.
    return(est)
## Computes the influence function for Fhat in the normal model.
def b_norm(es, u):
    return( 1.0 * ( es <= u ) - norm.cdf(u)
            + ( es + u * ( es ** 2.0 - 1.0 ) / 2.0 ) * norm.pdf(u) )
## Integrand of true inflated MSE.
def kern_iMSE(es, u):
    return( b_norm(es, u) ** 2.0 * norm.pdf(es) )
## Integrand of the true inflated IMSE.
def kern_iIMSE(u):
    return( sp.integrate.quad(kern_iMSE, -np.inf, np.inf, args=(u,))[0] )
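## Note (our addition): kern_iIMSE nests one quadrature inside another, so
## the true inflated IMSE below is computed over the whole real line; the R
## script instead approximates the outer integral with a Riemann sum on
## [-4, 4].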
## Computes ( Fhat - F ) ^ 2 for the normal errors case; returns a scalar
## so it can be integrated directly with scipy.integrate.quad.
def diff_sq_norm(u, res):
    return( ( np.mean( res <= u ) - norm.cdf(u) ) ** 2.0 )
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes.
samples = np.array( [100, 200, 500, 1000] )
## u values to check inflated MSE.
useq = np.array( [-1.5, -1.0, 0.0, 1.0, 1.5] )
## Fhat - F values.
Fhat_diff = np.empty( [runs, useq.size] )
## ISE values.
Fhat_ISE = np.empty(runs)
## Inflated bias values.
Fhat_ibias = np.empty( [samples.size, useq.size] )
## Inflated variance values.
Fhat_ivar = np.empty( [samples.size, useq.size] )
## Inflated MSE values.
Fhat_iMSE = np.empty( [samples.size, useq.size] )
## Inflated IMSE values.
Fhat_iIMSE = np.empty(samples.size)
## Simulation loop.
for n in samples:
    ## Get the index for n in samples.
    n_ind = np.where( samples == n )[0][0]
    ## Open a progress bar.
    bar = FancyBar(message='Sample size: ' + str(n), max=runs)
    for s in range(runs):
        ## Generate covariates.
        x1 = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        x2 = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        X = np.column_stack( (x1, x2) )
        ## Responses generated through the model equation.
        Y = reg(X) + scl(X) * norm.rvs(size=n)
        ## Missingness indicators.
        Z = deltas( PI(X) )
        ## Complete cases.
        X1 = X[np.where( Z == 1 )[0], :]
        Y1 = Y[ np.where( Z == 1 )[0] ]
        ## A suitable bandwidth for the local cubic smoother.
        bwd_reg = 3.0 * ( n * np.log(n) ) ** ( -1.0 / 7.0 )
        ## Complete case fitted values.
        fits_CC = P3_fit(X1, X1, Y1, bwd_reg)
        ## Complete case second moment estimates.
        m2_fits_CC = P3_fit(X1, X1, Y1 ** 2.0, bwd_reg)
        ## Complete case scale estimates.
        scl_CC = np.sqrt( np.abs( m2_fits_CC - fits_CC ** 2.0 ) )
        ## Complete case residuals.
        res_CC = ( Y1 - fits_CC ) / scl_CC
        ## Fhat pointwise differences.
        Fhat_diff[s, :] = Fhat(useq, res_CC) - norm.cdf(useq)
        ## Fhat ISE values.
        Fhat_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                        args=(res_CC,))[0]
        ## Advance the progress bar.
        bar.next()
    ## Fhat inflated bias.
    Fhat_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff, axis=0)
    ## Fhat inflated variance.
    Fhat_ivar[n_ind, :] = n * np.var(Fhat_diff, axis=0)
    ## Fhat inflated MSE.
    Fhat_iMSE[n_ind, :] = Fhat_ibias[n_ind, :] ** 2.0 + Fhat_ivar[n_ind, :]
    ## Fhat inflated IMSE.
    Fhat_iIMSE[n_ind] = n * np.mean(Fhat_ISE)
    ## Close the progress bar for this sample size.
    bar.finish()
## Simulation results.
## Average probability of a complete case: E(delta) = E(1 - PI(X)) = 1/2 here.
avg_miss = 1.0 / 2.0
## True inflated MSE values.
iMSE_norm = np.empty(useq.size)
for i in range(useq.size):
    iMSE_norm[i] = sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                                     args=(useq[i],))[0] / avg_miss
## True inflated IMSE values.
iIMSE_norm = sp.integrate.quad(kern_iIMSE, -np.inf, np.inf)[0] / avg_miss
## Fhat bias results.
bias_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_ibias) )
## Fhat variance results.
var_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_ivar) )
## Vector of sample sizes plus infinity.
samp_plus_inf = np.append(samples, np.inf).reshape( (samples.size + 1, 1) )
## Fhat inflated MSE with true inflated MSE values.
MSE_vals = np.vstack( (Fhat_iMSE, iMSE_norm) )
## Fhat inflated MSE results.
MSE_results = np.column_stack( (samp_plus_inf, MSE_vals) )
## Fhat inflated IMSE with true inflated IMSE values.
IMSE_vals = np.append(Fhat_iIMSE, iIMSE_norm)
## Fhat inflated IMSE results.
IMSE_results = np.column_stack( (samp_plus_inf, IMSE_vals) )
## Display simulation summary tables.
print( tabulate(bias_results) )
print( tabulate(var_results) )
print( tabulate(MSE_results) )
print( tabulate(IMSE_results) )
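## Unlike the R script, nothing is written to disk here. A minimal sketch,
## if saved output is wanted (our addition, not part of the original):
## np.savez("heterrdf_results.npz", bias=bias_results, var=var_results,
##          MSE=MSE_results, IMSE=IMSE_results)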