Chown and Müller (2013) investigate the residual-based empirical distribution function in nonparametric regression when responses are missing at random. The R and Python code below simulates the finite-sample MSE and MISE of this estimator to demonstrate its root-n consistency.
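As a quick orientation (a sketch assembled from the definitions in the code below, not notation taken from the paper): both scripts simulate data from the model
\[
Y = r(X) + \varepsilon, \qquad r(x) = x + \cos(3\pi x/2), \qquad X \sim \mathrm{Uniform}(-1,1), \qquad \varepsilon \sim N(0, \sigma_0^2),
\]
with each response observed according to a logistic probability in the covariate. Residuals \(\hat{\varepsilon}_j\) come from a local linear fit (complete cases only for the "efficient" version, an imputation-based fit for the "imputed" version), and the residual-based empirical distribution function
\[
\hat{F}(t) = \frac{1}{N} \sum_{j=1}^{N} \mathbf{1}\{\hat{\varepsilon}_j \le t\},
\]
with N the number of residuals, is compared to the true error distribution through the inflated criteria \(n \cdot \mathrm{MSE}\) at selected points and \(n \cdot \mathrm{MISE}\), averaged over 1000 runs for sample sizes 50, 100, 200 and 1000.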
R code:
################################################################################
## Simulation to demonstrate the root-n consistency of F-hat via the MSE and ##
## MISE for finite samples. ##
################################################################################
## Required libraries.
require(tcltk)
## Set the correct working directory.
setwd("~/Documents/Published/homerrdf/simulations/")
## Custom functions.
## The regression function.
reg = function(x) {
return( x + cos( 3 * pi * x / 2 ) )
}
## Missingness probability function.
PI = function(x) {
return( 1 / ( 1 + exp( -x ) ) )
}
## Generate random missingness indicators.
deltas = function(probs) {
return( ( runif( length(probs) ) > probs ) * 1 )
}
## The cosine kernel function.
w_cos = function(u) {
return( pi * cos( pi * u / 2 ) * ( abs(u) <= 1 ) / 4 )
}
## Computes the matrix of local linear terms.
psi = function(u) {
return( cbind(1, u) )
}
## Computes the local linear estimate at x.
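## P1_fit_x solves the locally weighted least squares problem
##   betahat = solve( t(Psi) %*% W %*% Psi, t(Psi) %*% W %*% y ),
## where W holds the kernel weights w_cos( ( x - xdata ) / bwd ) on its
## diagonal, and returns the first component of betahat as the local
## linear estimate of the regression function at x.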
P1_fit_x = function(x, xdata, ydata, bwd) {
## Sample size.
nsamp = length(ydata)
## Transform xdata to localized covariates.
U = ( x - xdata ) / bwd
## Diagonal weight matrix.
W = diag( w_cos(U) )
## n-by-2 matrix of polynomial terms.
Psi = psi(U)
## 2-by-2 Gram matrix for P1 basis.
G = t(Psi) %*% W %*% Psi
## Inner products between ydata and P1.
v = t(Psi) %*% W %*% matrix(ydata, nrow=nsamp)
## Apply regularization only to small samples.
if( nsamp <= 150 ) { G = G + diag(1e-5, 2) }
## Local polynomial coefficients.
betahat = qr.solve(G, v)
## Return the first component.
return( betahat[1] )
}
## Wrapper function to compute local linear estimates with many covariates.
P1_fit = function(xs, xdata, ydata, bwd) {
return( sapply(xs, P1_fit_x, xdata, ydata, bwd) )
}
## Computes Fhat.
Fhat = function(us, res) {
return( sapply(us, function(u, res) { return( mean( res <= u ) ) }, res) )
}
## Computes the influence function for Fhat in the normal errors model.
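## For N(0, sig0^2) errors this evaluates, for each u in us,
##   b(u, e) = 1{ e <= u } - pnorm(u, sd = sig0) + e * dnorm(u, sd = sig0)
## at every residual e in es; the same expression drives the true
## inflated MSE and MISE calculations further below.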
bnorm = function(us, es) {
return( sapply(us, function(u, es) {
return( 1 * ( es <= u ) - pnorm(u, sd=sig0) +
es * dnorm(u, sd=sig0) ) }, es) )
}
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
diff_sq_norm = function(us, res) {
return( ( Fhat(us, res) - pnorm(us, sd=sig0) ) ^ 2 )
}
## Simulation.
## Number of simulation runs.
runs = 1000
## Error standard deviation.
sig0 = 1
## Sample sizes 50, 100, 200 and 1000.
samples = c(50, 100, 200, 1000)
## t values to check inflated MSE.
tseq = c(-1.5, -1, 0, 1, 1.5)
## Fhat - F values.
Fhat_diff_eff = array(NA, c(runs, length(tseq)))
Fhat_diff_imp = array(NA, c(runs, length(tseq)))
## Simulated ISE values.
Fhat_eff_ISE = rep(NA, runs)
Fhat_imp_ISE = rep(NA, runs)
## Fhat bias values.
Fhat_eff_bias = array(NA, c(length(samples), length(tseq)))
Fhat_imp_bias = array(NA, c(length(samples), length(tseq)))
## Fhat variance values.
Fhat_eff_var = array(NA, c(length(samples), length(tseq)))
Fhat_imp_var = array(NA, c(length(samples), length(tseq)))
## Fhat MSE values.
Fhat_eff_MSE = array(NA, c(length(samples), length(tseq)))
Fhat_imp_MSE = array(NA, c(length(samples), length(tseq)))
## Fhat MISE values.
Fhat_eff_MISE = rep(NA, length(samples))
Fhat_imp_MISE = rep(NA, length(samples))
## Open a progress bar.
pb = tkProgressBar(title="Fhat Simulation", min=0, max=runs)
## Simulation
for(n in samples) {
## Fill progress bar with current information.
setTkProgressBar(pb, 0, label=paste0("Sample size: ", n))
## Set initial time difference to zero.
avg_time_diff = 0
## Get index for n in samples.
nind = which( samples == n )
for(s in 1 : runs) {
## Start a computing timer.
init_time = proc.time()[3]
## Covariates.
X = runif(n, -1, 1)
## Responses generated through model equation.
Y = reg(X) + rnorm(n, 0, sig0)
## Missingness indicators.
Z = deltas( PI(X) )
## Complete cases
X1 = X[ which( Z == 1 ) ]
Y1 = Y[ which( Z == 1 ) ]
## A suitable bandwidth for the regression estimator.
bwd_reg = 1.25 * ( n * log(n) ) ^ ( -1 / 4 )
## Complete case fitted values.
fits_CC = P1_fit(X1, X1, Y1, bwd_reg)
## Complete case residuals.
res_CC = Y1 - fits_CC
## Fully imputed responses.
Y_imp = P1_fit(X, X1, Y1, bwd_reg)
## Fitted values from imputed data.
fits_imp = P1_fit(X1, X, Y_imp, bwd_reg)
## Residuals from imputed fitted values.
res_imp = Y1 - fits_imp
## Fhat pointwise differences.
Fhat_diff_eff[s, ] = Fhat(tseq, res_CC) - pnorm(tseq, sd=sig0)
Fhat_diff_imp[s, ] = Fhat(tseq, res_imp) - pnorm(tseq, sd=sig0)
## Fhat ISE.
Fhat_eff_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_CC,
stop.on.error=FALSE)$value
Fhat_imp_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_imp,
stop.on.error=FALSE)$value
## One step corrected estimate of mean computing time.
avg_time_diff = ( ( s - 1 ) * avg_time_diff +
proc.time()[3] - init_time ) / s
## Estimated time to finish (in hours).
ETF = round(avg_time_diff * ( runs - s ) / 3600, digits=1)
## Updated progress bar information.
pb_msg = paste0("Sample size: ", n, " (", ETF, " hour(s) to go)")
## Update progress bar.
setTkProgressBar(pb, s, label=pb_msg)
}
## Fhat inflated bias.
Fhat_eff_bias[nind, ] = sqrt(n) * apply(Fhat_diff_eff, 2, mean)
Fhat_imp_bias[nind, ] = sqrt(n) * apply(Fhat_diff_imp, 2, mean)
## Fhat inflated variance.
Fhat_eff_var[nind, ] = n * apply(Fhat_diff_eff, 2, var)
Fhat_imp_var[nind, ] = n * apply(Fhat_diff_imp, 2, var)
## Fhat inflated MSE.
Fhat_eff_MSE[nind, ] = Fhat_eff_bias[nind, ] ^ 2 + Fhat_eff_var[nind, ]
Fhat_imp_MSE[nind, ] = Fhat_imp_bias[nind, ] ^ 2 + Fhat_imp_var[nind, ]
## Fhat inflated MISE.
Fhat_eff_MISE[nind] = n * mean(Fhat_eff_ISE)
Fhat_imp_MISE[nind] = n * mean(Fhat_imp_ISE)
}
## Close the progress bar.
close(pb)
## Simulation results.
## Average amount of missing data.
avg_miss = 1 / 2
## True inflated MSE values for Fhat
iMSE_norm = sapply(tseq, function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v, sd=sig0) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) / avg_miss
## Approximate inflated MISE value for Fhat.
iMISE_norm = sum(sapply(seq(-4*sig0, 4*sig0, by=0.0008), function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v, sd=sig0) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) ) * 0.0008 / avg_miss
## Fhat bias results.
bias_eff_results = cbind(samples, Fhat_eff_bias)
bias_imp_results = cbind(samples, Fhat_imp_bias)
## Fhat variance results.
var_eff_results = cbind(samples, Fhat_eff_var)
var_imp_results = cbind(samples, Fhat_imp_var)
## Fhat MSE results.
MSE_eff_results = cbind(c(samples, Inf), rbind(Fhat_eff_MSE, iMSE_norm))
MSE_imp_results = cbind(c(samples, Inf), rbind(Fhat_imp_MSE, iMSE_norm))
## Fhat MISE results.
MISE_eff_results = rbind(c(samples, Inf), c(Fhat_eff_MISE, iMISE_norm))
MISE_imp_results = rbind(c(samples, Inf), c(Fhat_imp_MISE, iMISE_norm))
## Display simulation summary tables.
round(bias_eff_results, digits=3)
round(var_eff_results, digits=3)
round(MSE_eff_results, digits=3)
round(MISE_eff_results, digits=3)
round(bias_imp_results, digits=3)
round(var_imp_results, digits=3)
round(MSE_imp_results, digits=3)
round(MISE_imp_results, digits=3)
## Save results to a file.
save(bias_eff_results, var_eff_results,
MSE_eff_results, MISE_eff_results,
bias_imp_results, var_imp_results,
MSE_imp_results, MISE_imp_results,
file="homerrdf_results.RData")
Python code:
###############################################################################
## A python simulation to demonstrate the root-n consistency of F-hat via ##
## the MSE and IMSE for finite samples. ##
###############################################################################
## Required libraries.
import numpy as np
import scipy as sp
## Explicit submodule imports so that sp.integrate and sp.linalg are available.
import scipy.integrate
import scipy.linalg
from scipy.stats import uniform
from scipy.stats import norm
from progress.bar import Bar
from tabulate import tabulate
## Progress bar class.
class FancyBar(Bar):
    suffix='%(percent)d%% - %(ETF).1fh to go.'
    @property
    def ETF(self):
        return self.eta / 3600.0
## Custom functions.
## The regression function.
def reg(x):
    return( x + np.cos( 3.0 * np.pi * x / 2.0 ) )
## Missingness probability function.
def PI(x):
    return( 1.0 / ( 1.0 + np.exp( -x ) ) )
## Generate random missingness indicators.
def deltas(probs):
    return( 1 * ( uniform.rvs(size=probs.size) > probs ) )
## The cosine kernel function.
def w_cos(u):
    return( np.pi * np.cos( np.pi * u / 2.0 ) * ( np.abs(u) <= 1.0 ) / 4.0 )
## Computes the matrix of local linear terms.
def psi(u):
    return( np.column_stack((np.ones(u.size), u)) )
## Computes the local linear estimate at x.
def P1_fit_x(x, xdata, ydata, bwd):
    ## Sample size.
    nsamp = ydata.size
    ## Transform xdata to localized covariates.
    U = ( x - xdata ) / bwd
    ## Diagonal weight matrix.
    W = np.diag( w_cos(U) )
    ## n-by-2 matrix of polynomial terms.
    Psi = psi(U)
    ## 2-by-2 Gram matrix for weighted P1 basis.
    G = np.matmul(np.matmul(Psi.T, W), Psi)
    ## Inner products between ydata and P1.
    v = np.matmul(np.matmul(Psi.T, W), ydata.reshape( (nsamp, 1) ))
    ## Apply regularization to small samples.
    if nsamp <= 150:
        G = G + np.diag( np.ones(2) / 1.0e5 )
    ## Local polynomial coefficients.
    betahat = sp.linalg.solve(G, v, assume_a='pos')
    ## Return the first component.
    return( betahat[0][0] )
## Wrapper function to compute local linear estimates with many covariates.
def P1_fit(xs, xdata, ydata, bwd):
    ## Estimates.
    est = np.empty(xs.size)
    ## Loop computing predicted values over each element of xs.
    for i in range(xs.size):
        est[i] = P1_fit_x(xs[i], xdata, ydata, bwd)
    ## Return the predicted values.
    return(est)
## Computes Fhat.
def Fhat(us, res):
    ## Estimates.
    est = np.empty(us.size)
    ## Compute Fhat for each u value in us.
    for i in range(us.size):
        est[i] = np.mean( res <= us[i] )
    ## Return the estimates.
    return(est)
## Computes the influence function for Fhat in the normal model.
def b_norm(es, u, sig0):
    return( 1.0 * ( es <= u ) - norm.cdf(u, scale=sig0)
            + es * norm.pdf(u, scale=sig0) )
## Integrand of true inflated MSE.
def kern_iMSE(es, u, sig0):
    return( b_norm(es, u, sig0) ** 2.0 * norm.pdf(es, scale=sig0) )
## Integrand of the true inflated IMSE.
def kern_iIMSE(u, sig0):
    return( sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                              args=(u, sig0))[0] )
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
def diff_sq_norm(u, res, sig0):
    ## Return a scalar so that scipy's quad receives a plain float.
    return( ( Fhat(np.array([u]), res)[0] - norm.cdf(u, scale=sig0) ) ** 2.0 )
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes.
samples = np.array( [50, 100, 200, 1000] )
## Error standard deviation.
sig0 = 1.0
## u values to check inflated MSE.
useq = np.array( [-1.5, -1.0, 0.0, 1.0, 1.5] )
## Fhat - F values.
Fhat_diff_eff = np.empty( [runs, useq.size] )
Fhat_diff_imp = np.empty( [runs, useq.size] )
## ISE values.
Fhat_eff_ISE = np.empty(runs)
Fhat_imp_ISE = np.empty(runs)
## Inflated bias values.
Fhat_eff_ibias = np.empty( [samples.size, useq.size] )
Fhat_imp_ibias = np.empty( [samples.size, useq.size] )
## Inflated variance values.
Fhat_eff_ivar = np.empty( [samples.size, useq.size] )
Fhat_imp_ivar = np.empty( [samples.size, useq.size] )
## Inflated MSE values.
Fhat_eff_iMSE = np.empty( [samples.size, useq.size] )
Fhat_imp_iMSE = np.empty( [samples.size, useq.size] )
## Inflated IMSE values.
Fhat_eff_iIMSE = np.empty(samples.size)
Fhat_imp_iIMSE = np.empty(samples.size)
## Simulation.
for n in samples:
    ## Get the index for n in samples.
    n_ind = np.where( samples == n )[0][0]
    ## Open a progress bar.
    bar = FancyBar(message='Sample size: ' + str(n), max=runs)
    for s in range(runs):
        ## Generate covariates.
        X = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        ## Responses via model equation.
        Y = reg(X) + norm.rvs(size=n, scale=sig0)
        ## Missingness indicators.
        Z = deltas( PI(X) )
        ## Complete cases.
        X1 = X[ np.where( Z == 1 )[0] ]
        Y1 = Y[ np.where( Z == 1 )[0] ]
        ## A suitable bandwidth for the local linear smoother.
        bwd_reg = 1.25 * ( n * np.log(n) ) ** ( -1.0 / 4.0 )
        ## Complete case fitted values.
        fits_CC = P1_fit(X1, X1, Y1, bwd_reg)
        ## Complete case residuals.
        res_CC = Y1 - fits_CC
        ## Fully imputed responses.
        Y_imp = P1_fit(X, X1, Y1, bwd_reg)
        ## Fitted values from imputed data.
        fits_imp = P1_fit(X1, X, Y_imp, bwd_reg)
        ## Residuals from modified fitted values.
        res_imp = Y1 - fits_imp
        ## Fhat pointwise differences.
        Fhat_diff_eff[s, :] = Fhat(useq, res_CC) - norm.cdf(useq, scale=sig0)
        Fhat_diff_imp[s, :] = Fhat(useq, res_imp) - norm.cdf(useq, scale=sig0)
        ## Fhat ISE values.
        Fhat_eff_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                            args=(res_CC, sig0))[0]
        Fhat_imp_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                            args=(res_imp, sig0))[0]
        bar.next()
    ## Fhat inflated bias.
    Fhat_eff_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff_eff, axis=0)
    Fhat_imp_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff_imp, axis=0)
    ## Fhat inflated variance.
    Fhat_eff_ivar[n_ind, :] = n * np.var(Fhat_diff_eff, axis=0)
    Fhat_imp_ivar[n_ind, :] = n * np.var(Fhat_diff_imp, axis=0)
    ## Fhat inflated MSE.
    Fhat_eff_iMSE[n_ind, :] = ( Fhat_eff_ibias[n_ind, :] ** 2.0
                                + Fhat_eff_ivar[n_ind, :] )
    Fhat_imp_iMSE[n_ind, :] = ( Fhat_imp_ibias[n_ind, :] ** 2.0
                                + Fhat_imp_ivar[n_ind, :] )
    ## Fhat inflated IMSE.
    Fhat_eff_iIMSE[n_ind] = n * np.mean(Fhat_eff_ISE)
    Fhat_imp_iIMSE[n_ind] = n * np.mean(Fhat_imp_ISE)
    bar.finish()
## Simulation results.
## Average amount of missing data.
avg_miss = 1.0 / 2.0
## True inflated MSE values.
iMSE_norm = np.empty(useq.size)
for i in range(useq.size):
    iMSE_norm[i] = sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                                     args=(useq[i], sig0))[0] / avg_miss
## True inflated IMSE values.
iIMSE_norm = sp.integrate.quad(kern_iIMSE, -np.inf, np.inf,
                               args=(sig0,))[0] / avg_miss
## Fhat bias results
bias_eff_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_eff_ibias) )
bias_imp_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_imp_ibias) )
## Fhat variance results.
var_eff_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_eff_ivar) )
var_imp_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_imp_ivar) )
## Vector of sample sizes plus infinity.
samp_plus_inf = np.append(samples, np.inf).reshape( (samples.size + 1, 1) )
## Fhat inflated MSE with true inflated MSE values.
MSE_eff_vals = np.vstack( (Fhat_eff_iMSE, iMSE_norm) )
MSE_imp_vals = np.vstack( (Fhat_imp_iMSE, iMSE_norm) )
## Fhat inflated MSE results.
MSE_eff_results = np.column_stack( (samp_plus_inf, MSE_eff_vals) )
MSE_imp_results = np.column_stack( (samp_plus_inf, MSE_imp_vals) )
## Fhat inflated IMSE with true inflated IMSE values.
IMSE_eff_vals = np.append(Fhat_eff_iIMSE, iIMSE_norm)
IMSE_eff_vals = IMSE_eff_vals.reshape( (samples.size + 1, 1) )
IMSE_imp_vals = np.append(Fhat_imp_iIMSE, iIMSE_norm)
IMSE_imp_vals = IMSE_imp_vals.reshape( (samples.size + 1, 1) )
## Fhat inflated IMSE results.
IMSE_eff_results = np.column_stack( (samp_plus_inf, IMSE_eff_vals) )
IMSE_imp_results = np.column_stack( (samp_plus_inf, IMSE_imp_vals) )
## Display simulation summary tables.
print( tabulate(bias_eff_results) )
print( tabulate(bias_imp_results) )
print( tabulate(var_eff_results) )
print( tabulate(var_imp_results) )
print( tabulate(MSE_eff_results) )
print( tabulate(MSE_imp_results) )
print( tabulate(IMSE_eff_results) )
print( tabulate(IMSE_imp_results) )