Chown and Müller (2013) investigate the residual-based empirical distribution function in nonparametric regression when responses are missing at random. The R and Python code below simulates the finite-sample MSE and MISE of this estimator to demonstrate its root-n consistency.
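As a quick orientation (a sketch assembled from the definitions in the code below, not notation taken from the paper): both scripts simulate data from the model
\[
Y = r(X) + \varepsilon, \qquad r(x) = x + \cos(3\pi x/2), \qquad X \sim \mathrm{Uniform}(-1,1), \qquad \varepsilon \sim N(0, \sigma_0^2),
\]
with each response observed according to a logistic probability in the covariate. Residuals \(\hat{\varepsilon}_j\) come from a local linear fit (complete cases only for the "efficient" version, an imputation-based fit for the "imputed" version), and the residual-based empirical distribution function
\[
\hat{F}(t) = \frac{1}{N} \sum_{j=1}^{N} \mathbf{1}\{\hat{\varepsilon}_j \le t\},
\]
with N the number of residuals, is compared to the true error distribution through the inflated criteria \(n \cdot \mathrm{MSE}\) at selected points and \(n \cdot \mathrm{MISE}\), averaged over 1000 runs for sample sizes 50, 100, 200 and 1000.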
R code:
################################################################################
## Simulation to demonstrate the root-n consistency of F-hat via the MSE and ##
## MISE for finite samples. ##
################################################################################
## Required libraries.
require(tcltk)
## Set the correct working directory.
setwd("~/Documents/Published/homerrdf/simulations/")
## Custom functions.
## The regression function.
reg = function(x) {
return( x + cos( 3 * pi * x / 2 ) )
}
## Missingness probability function.
PI = function(x) {
return( 1 / ( 1 + exp( -x ) ) )
}
## Generate random missingness indicators.
deltas = function(probs) {
return( ( runif( length(probs) ) > probs ) * 1 )
}
## The cosine kernel function.
w_cos = function(u) {
return( pi * cos( pi * u / 2 ) * ( abs(u) <= 1 ) / 4 )
}
## Computes the matrix of local linear terms.
psi = function(u) {
return( cbind(1, u) )
}
## Computes the local linear estimate at x.
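## P1_fit_x solves the locally weighted least squares problem
##   betahat = solve( t(Psi) %*% W %*% Psi, t(Psi) %*% W %*% y ),
## where W holds the kernel weights w_cos( ( x - xdata ) / bwd ) on its
## diagonal, and returns the first component of betahat as the local
## linear estimate of the regression function at x.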
P1_fit_x = function(x, xdata, ydata, bwd) {
## Sample size.
nsamp = length(ydata)
## Transform xdata to localized covariates.
U = ( x - xdata ) / bwd
## Diagonal weight matrix.
W = diag( w_cos(U) )
## n-by-2 matrix of polynomial terms.
Psi = psi(U)
## 2-by-2 Gram matrix for P1 basis.
G = t(Psi) %*% W %*% Psi
## Inner products between ydata and P1.
v = t(Psi) %*% W %*% matrix(ydata, nrow=nsamp)
## Apply regularization only to small samples.
if( nsamp <= 150 ) { G = G + diag(1e-5, 2) }
## Local polynomial coefficients.
betahat = qr.solve(G, v)
## Return the first component.
return( betahat[1] )
}
## Wrapper function to compute local linear estimates with many covariates.
P1_fit = function(xs, xdata, ydata, bwd) {
return( sapply(xs, P1_fit_x, xdata, ydata, bwd) )
}
## Computes Fhat.
Fhat = function(us, res) {
return( sapply(us, function(u, res) { return( mean( res <= u ) ) }, res) )
}
## Computes the influence function for Fhat in the normal errors model.
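## For N(0, sig0^2) errors this evaluates, for each u in us,
##   b(u, e) = 1{ e <= u } - pnorm(u, sd = sig0) + e * dnorm(u, sd = sig0)
## at every residual e in es; the same expression drives the true
## inflated MSE and MISE calculations further below.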
bnorm = function(us, es) {
return( sapply(us, function(u, es) {
return( 1 * ( es <= u ) - pnorm(u, sd=sig0) +
es * dnorm(u, sd=sig0) ) }, es) )
}
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
diff_sq_norm = function(us, res) {
return( ( Fhat(us, res) - pnorm(us, sd=sig0) ) ^ 2 )
}
## Simulation.
## Number of simulation runs.
runs = 1000
## Error standard deviation.
sig0 = 1
## Sample sizes 50, 100, 200 and 1000.
samples = c(50, 100, 200, 1000)
## t values to check inflated MSE.
tseq = c(-1.5, -1, 0, 1, 1.5)
## Fhat - F values.
Fhat_diff_eff = array(NA, c(runs, length(tseq)))
Fhat_diff_imp = array(NA, c(runs, length(tseq)))
## Simulated ISE values.
Fhat_eff_ISE = rep(NA, runs)
Fhat_imp_ISE = rep(NA, runs)
## Fhat bias values.
Fhat_eff_bias = array(NA, c(length(samples), length(tseq)))
Fhat_imp_bias = array(NA, c(length(samples), length(tseq)))
## Fhat variance values.
Fhat_eff_var = array(NA, c(length(samples), length(tseq)))
Fhat_imp_var = array(NA, c(length(samples), length(tseq)))
## Fhat MSE values.
Fhat_eff_MSE = array(NA, c(length(samples), length(tseq)))
Fhat_imp_MSE = array(NA, c(length(samples), length(tseq)))
## Fhat MISE values.
Fhat_eff_MISE = rep(NA, length(samples))
Fhat_imp_MISE = rep(NA, length(samples))
## Open a progress bar.
pb = tkProgressBar(title="Fhat Simulation", min=0, max=runs)
## Simulation
for(n in samples) {
## Fill progress bar with current information.
setTkProgressBar(pb, 0, label=paste0("Sample size: ", n))
## Set initial time difference to zero.
avg_time_diff = 0
## Get index for n in samples.
nind = which( samples == n )
for(s in 1 : runs) {
## Start a computing timer.
init_time = proc.time()[3]
## Covariates.
X = runif(n, -1, 1)
## Responses generated through model equation.
Y = reg(X) + rnorm(n, 0, sig0)
## Missingness indicators.
Z = deltas( PI(X) )
## Complete cases
X1 = X[ which( Z == 1 ) ]
Y1 = Y[ which( Z == 1 ) ]
## A suitable bandwidth for the regression estimator.
bwd_reg = 1.25 * ( n * log(n) ) ^ ( -1 / 4 )
## Complete case fitted values.
fits_CC = P1_fit(X1, X1, Y1, bwd_reg)
## Complete case residuals.
res_CC = Y1 - fits_CC
## Fully imputed responses.
Y_imp = P1_fit(X, X1, Y1, bwd_reg)
## Fitted values from imputed data.
fits_imp = P1_fit(X1, X, Y_imp, bwd_reg)
## Residuals from imputed fitted values.
res_imp = Y1 - fits_imp
## Fhat pointwise differences.
Fhat_diff_eff[s, ] = Fhat(tseq, res_CC) - pnorm(tseq, sd=sig0)
Fhat_diff_imp[s, ] = Fhat(tseq, res_imp) - pnorm(tseq, sd=sig0)
## Fhat ISE.
Fhat_eff_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_CC,
stop.on.error=FALSE)$value
Fhat_imp_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_imp,
stop.on.error=FALSE)$value
## One step corrected estimate of mean computing time.
avg_time_diff = ( ( s - 1 ) * avg_time_diff +
proc.time()[3] - init_time ) / s
## Estimated time to finish (in hours).
ETF = round(avg_time_diff * ( runs - s ) / 3600, digits=1)
## Updated progress bar information.
pb_msg = paste0("Sample size: ", n, " (", ETF, " hour(s) to go)")
## Update progress bar.
setTkProgressBar(pb, s, label=pb_msg)
}
## Fhat inflated bias.
Fhat_eff_bias[nind, ] = sqrt(n) * apply(Fhat_diff_eff, 2, mean)
Fhat_imp_bias[nind, ] = sqrt(n) * apply(Fhat_diff_imp, 2, mean)
## Fhat inflated variance.
Fhat_eff_var[nind, ] = n * apply(Fhat_diff_eff, 2, var)
Fhat_imp_var[nind, ] = n * apply(Fhat_diff_imp, 2, var)
## Fhat inflated MSE.
Fhat_eff_MSE[nind, ] = Fhat_eff_bias[nind, ] ^ 2 + Fhat_eff_var[nind, ]
Fhat_imp_MSE[nind, ] = Fhat_imp_bias[nind, ] ^ 2 + Fhat_imp_var[nind, ]
## Fhat inflated MISE.
Fhat_eff_MISE[nind] = n * mean(Fhat_eff_ISE)
Fhat_imp_MISE[nind] = n * mean(Fhat_imp_ISE)
}
## Close the progress bar.
close(pb)
## Simulation results.
## Average amount of missing data.
avg_miss = 1 / 2
## True inflated MSE values for Fhat
iMSE_norm = sapply(tseq, function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v, sd=sig0) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) / avg_miss
## Approximate inflated MISE value for Fhat.
iMISE_norm = sum(sapply(seq(-4*sig0, 4*sig0, by=0.0008), function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v, sd=sig0) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) ) * 0.0008 / avg_miss
## Fhat bias results.
bias_eff_results = cbind(samples, Fhat_eff_bias)
bias_imp_results = cbind(samples, Fhat_imp_bias)
## Fhat variance results.
var_eff_results = cbind(samples, Fhat_eff_var)
var_imp_results = cbind(samples, Fhat_imp_var)
## Fhat MSE results.
MSE_eff_results = cbind(c(samples, Inf), rbind(Fhat_eff_MSE, iMSE_norm))
MSE_imp_results = cbind(c(samples, Inf), rbind(Fhat_imp_MSE, iMSE_norm))
## Fhat MISE results.
MISE_eff_results = rbind(c(samples, Inf), c(Fhat_eff_MISE, iMISE_norm))
MISE_imp_results = rbind(c(samples, Inf), c(Fhat_imp_MISE, iMISE_norm))
## Display simulation summary tables.
round(bias_eff_results, digits=3)
round(var_eff_results, digits=3)
round(MSE_eff_results, digits=3)
round(MISE_eff_results, digits=3)
round(bias_imp_results, digits=3)
round(var_imp_results, digits=3)
round(MSE_imp_results, digits=3)
round(MISE_imp_results, digits=3)
## Save results to a file.
save(bias_eff_results, var_eff_results,
MSE_eff_results, MISE_eff_results,
bias_imp_results, var_imp_results,
MSE_imp_results, MISE_imp_results,
file="homerrdf_results.RData")
Python code:
###############################################################################
## A python simulation to demonstrate the root-n consistency of F-hat via ##
## the MSE and IMSE for finite samples. ##
###############################################################################
## Required libraries.
import numpy as np
import scipy as sp
## Explicit submodule imports so that sp.integrate and sp.linalg are available.
import scipy.integrate
import scipy.linalg
from scipy.stats import uniform
from scipy.stats import norm
from progress.bar import Bar
from tabulate import tabulate
## Progress bar class.
class FancyBar(Bar):
    suffix='%(percent)d%% - %(ETF).1fh to go.'
    @property
    def ETF(self):
        return self.eta / 3600.0
## Custom functions.
## The regression function.
def reg(x):
    return( x + np.cos( 3.0 * np.pi * x / 2.0 ) )
## Missingness probability function.
def PI(x):
    return( 1.0 / ( 1.0 + np.exp( -x ) ) )
## Generate random missingness indicators.
def deltas(probs):
    return( 1 * ( uniform.rvs(size=probs.size) > probs ) )
## The cosine kernel function.
def w_cos(u):
    return( np.pi * np.cos( np.pi * u / 2.0 ) * ( np.abs(u) <= 1.0 ) / 4.0 )
## Computes the matrix of local linear terms.
def psi(u):
    return( np.column_stack((np.ones(u.size), u)) )
## Computes the local linear estimate at x.
def P1_fit_x(x, xdata, ydata, bwd):
    ## Sample size.
    nsamp = ydata.size
    ## Transform xdata to localized covariates.
    U = ( x - xdata ) / bwd
    ## Diagonal weight matrix.
    W = np.diag( w_cos(U) )
    ## n-by-2 matrix of polynomial terms.
    Psi = psi(U)
    ## 2-by-2 Gram matrix for weighted P1 basis.
    G = np.matmul(np.matmul(Psi.T, W), Psi)
    ## Inner products between ydata and P1.
    v = np.matmul(np.matmul(Psi.T, W), ydata.reshape( (nsamp, 1) ))
    ## Apply regularization to small samples.
    if nsamp <= 150:
        G = G + np.diag( np.ones(2) / 1.0e5 )
    ## Local polynomial coefficients.
    betahat = sp.linalg.solve(G, v, assume_a='pos')
    ## Return the first component.
    return( betahat[0][0] )
## Wrapper function to compute local linear estimates with many covariates.
def P1_fit(xs, xdata, ydata, bwd):
    ## Estimates.
    est = np.empty(xs.size)
    ## Loop computing predicted values over each element of xs.
    for i in range(xs.size):
        est[i] = P1_fit_x(xs[i], xdata, ydata, bwd)
    ## Return the predicted values.
    return(est)
## Computes Fhat.
def Fhat(us, res):
    ## Estimates.
    est = np.empty(us.size)
    ## Compute Fhat for each u value in us.
    for i in range(us.size):
        est[i] = np.mean( res <= us[i] )
    ## Return the estimates.
    return(est)
## Computes the influence function for Fhat in the normal model.
def b_norm(es, u, sig0):
    return( 1.0 * ( es <= u ) - norm.cdf(u, scale=sig0)
            + es * norm.pdf(u, scale=sig0) )
## Integrand of true inflated MSE.
def kern_iMSE(es, u, sig0):
    return( b_norm(es, u, sig0) ** 2.0 * norm.pdf(es, scale=sig0) )
## Integrand of the true inflated IMSE.
def kern_iIMSE(u, sig0):
    return( sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                              args=(u, sig0))[0] )
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
def diff_sq_norm(u, res, sig0):
    ## Return a scalar so that scipy's quad receives a plain float.
    return( ( Fhat(np.array([u]), res)[0] - norm.cdf(u, scale=sig0) ) ** 2.0 )
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes.
samples = np.array( [50, 100, 200, 1000] )
## Error standard deviation.
sig0 = 1.0
## u values to check inflated MSE.
useq = np.array( [-1.5, -1.0, 0.0, 1.0, 1.5] )
## Fhat - F values.
Fhat_diff_eff = np.empty( [runs, useq.size] )
Fhat_diff_imp = np.empty( [runs, useq.size] )
## ISE values.
Fhat_eff_ISE = np.empty(runs)
Fhat_imp_ISE = np.empty(runs)
## Inflated bias values.
Fhat_eff_ibias = np.empty( [samples.size, useq.size] )
Fhat_imp_ibias = np.empty( [samples.size, useq.size] )
## Inflated variance values.
Fhat_eff_ivar = np.empty( [samples.size, useq.size] )
Fhat_imp_ivar = np.empty( [samples.size, useq.size] )
## Inflated MSE values.
Fhat_eff_iMSE = np.empty( [samples.size, useq.size] )
Fhat_imp_iMSE = np.empty( [samples.size, useq.size] )
## Inflated IMSE values.
Fhat_eff_iIMSE = np.empty(samples.size)
Fhat_imp_iIMSE = np.empty(samples.size)
## Simulation.
for n in samples:
    ## Get the index for n in samples.
    n_ind = np.where( samples == n )[0][0]
    ## Open a progress bar.
    bar = FancyBar(message='Sample size: ' + str(n), max=runs)
    for s in range(runs):
        ## Generate covariates.
        X = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        ## Responses via model equation.
        Y = reg(X) + norm.rvs(size=n, scale=sig0)
        ## Missingness indicators.
        Z = deltas( PI(X) )
        ## Complete cases.
        X1 = X[ np.where( Z == 1 )[0] ]
        Y1 = Y[ np.where( Z == 1 )[0] ]
        ## A suitable bandwidth for the local linear smoother.
        bwd_reg = 1.25 * ( n * np.log(n) ) ** ( -1.0 / 4.0 )
        ## Complete case fitted values.
        fits_CC = P1_fit(X1, X1, Y1, bwd_reg)
        ## Complete case residuals.
        res_CC = Y1 - fits_CC
        ## Fully imputed responses.
        Y_imp = P1_fit(X, X1, Y1, bwd_reg)
        ## Fitted values from imputed data.
        fits_imp = P1_fit(X1, X, Y_imp, bwd_reg)
        ## Residuals from modified fitted values.
        res_imp = Y1 - fits_imp
        ## Fhat pointwise differences.
        Fhat_diff_eff[s, :] = Fhat(useq, res_CC) - norm.cdf(useq, scale=sig0)
        Fhat_diff_imp[s, :] = Fhat(useq, res_imp) - norm.cdf(useq, scale=sig0)
        ## Fhat ISE values.
        Fhat_eff_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                            args=(res_CC, sig0))[0]
        Fhat_imp_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                            args=(res_imp, sig0))[0]
        bar.next()
    ## Fhat inflated bias.
    Fhat_eff_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff_eff, axis=0)
    Fhat_imp_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff_imp, axis=0)
    ## Fhat inflated variance.
    Fhat_eff_ivar[n_ind, :] = n * np.var(Fhat_diff_eff, axis=0)
    Fhat_imp_ivar[n_ind, :] = n * np.var(Fhat_diff_imp, axis=0)
    ## Fhat inflated MSE.
    Fhat_eff_iMSE[n_ind, :] = ( Fhat_eff_ibias[n_ind, :] ** 2.0
                                + Fhat_eff_ivar[n_ind, :] )
    Fhat_imp_iMSE[n_ind, :] = ( Fhat_imp_ibias[n_ind, :] ** 2.0
                                + Fhat_imp_ivar[n_ind, :] )
    ## Fhat inflated IMSE.
    Fhat_eff_iIMSE[n_ind] = n * np.mean(Fhat_eff_ISE)
    Fhat_imp_iIMSE[n_ind] = n * np.mean(Fhat_imp_ISE)
    bar.finish()
## Simulation results.
## Average amount of missing data.
avg_miss = 1.0 / 2.0
## True inflated MSE values.
iMSE_norm = np.empty(useq.size)
for i in range(useq.size):
    iMSE_norm[i] = sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                                     args=(useq[i], sig0))[0] / avg_miss
## True inflated IMSE values.
iIMSE_norm = sp.integrate.quad(kern_iIMSE, -np.inf, np.inf,
                               args=(sig0,))[0] / avg_miss
## Fhat bias results
bias_eff_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_eff_ibias) )
bias_imp_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_imp_ibias) )
## Fhat variance results.
var_eff_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_eff_ivar) )
var_imp_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_imp_ivar) )
## Vector of sample sizes plus infinity.
samp_plus_inf = np.append(samples, np.inf).reshape( (samples.size + 1, 1) )
## Fhat inflated MSE with true inflated MSE values.
MSE_eff_vals = np.vstack( (Fhat_eff_iMSE, iMSE_norm) )
MSE_imp_vals = np.vstack( (Fhat_imp_iMSE, iMSE_norm) )
## Fhat inflated MSE results.
MSE_eff_results = np.column_stack( (samp_plus_inf, MSE_eff_vals) )
MSE_imp_results = np.column_stack( (samp_plus_inf, MSE_imp_vals) )
## Fhat inflated IMSE with true inflated IMSE values.
IMSE_eff_vals = np.append(Fhat_eff_iIMSE, iIMSE_norm)
IMSE_eff_vals = IMSE_eff_vals.reshape( (samples.size + 1, 1) )
IMSE_imp_vals = np.append(Fhat_imp_iIMSE, iIMSE_norm)
IMSE_imp_vals = IMSE_imp_vals.reshape( (samples.size + 1, 1) )
## Fhat inflated IMSE results.
IMSE_eff_results = np.column_stack( (samp_plus_inf, IMSE_eff_vals) )
IMSE_imp_results = np.column_stack( (samp_plus_inf, IMSE_imp_vals) )
## Display simulation summary tables.
print( tabulate(bias_eff_results) )
print( tabulate(bias_imp_results) )
print( tabulate(var_eff_results) )
print( tabulate(var_imp_results) )
print( tabulate(MSE_eff_results) )
print( tabulate(MSE_imp_results) )
print( tabulate(IMSE_eff_results) )
print( tabulate(IMSE_imp_results) )