Chown (2016) investigates the residual-based empirical distribution function in heteroscedastic nonparametric regression when the responses are missing at random.
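As a quick sketch of the objects the scripts below estimate (notation ours, assuming the usual heteroscedastic formulation): responses follow $Y = r(X) + s(X)\varepsilon$ with the standardized error $\varepsilon$ independent of the covariate $X$, an indicator $\delta = 1$ marks an observed response, and the estimator is the residual-based empirical distribution function over the $N$ complete cases,
$$ \hat F(t) \;=\; \frac{1}{N} \sum_{i \,:\, \delta_i = 1} \mathbf{1}\{ \hat\varepsilon_i \le t \}, \qquad \hat\varepsilon_i \;=\; \frac{Y_i - \hat r(X_i)}{\hat s(X_i)}, $$
where $\hat r$ and $\hat s$ are nonparametric (here, local cubic) estimates of the regression and scale functions.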
R code:
################################################################################
## Simulation to demonstrate the root-n consistency of F-hat via the MSE and ##
## MISE for finite samples.                                                  ##
################################################################################
## Required libraries.
require(tcltk)
## Set the correct working directory.
setwd("~/Documents/Published/heterrdf/simulations/")
## Custom functions.
## Regression function at x.
reg_x = function(x) {
return( 1 + x[1] - x[2] + 2 * exp( -sqrt( crossprod(x) ) / 2 ) )
}
## Wrapper function to compute regression for many points.
reg = function(xs) {
return( apply(xs, 1, reg_x) )
}
## Scale function at x.
scl_x = function(x) {
return( sqrt( 1 + 2 * crossprod(x) ) )
}
## Wrapper function to compute scale for many points.
scl = function(x) {
return( apply(x, 1, scl_x) )
}
## Missingness probability function at x.
PI_x = function(x) {
return( 1 / ( 1 + exp( -( x[1] + x[2] ) / 2 ) ) )
}
## Wrapper function to compute missingness probability for many points.
PI = function(x) {
return( apply(x, 1, PI_x) )
}
## Generate random missingness indicators: delta = 1 marks an observed
## response, which occurs with probability 1 - PI(x).
deltas = function(probs) {
return( ( runif( length(probs) ) > probs ) * 1 )
}
## Computes a product of cosine kernel functions.
w_cos = function(u) {
## Cosine kernel values for first variable.
k1 = pi * cos( pi * u[, 1] / 2 ) * ( abs(u[, 1]) <= 1 ) / 4
## Cosine kernel values for second variable.
k2 = pi * cos( pi * u[, 2] / 2 ) * ( abs(u[, 2]) <= 1 ) / 4
## Return product of kernel values.
return( k1 * k2 )
}
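## Optional sanity check (our addition, not part of the original script):
## each cosine kernel integrates to one over [-1, 1], so a Monte Carlo
## average of w_cos over [-1, 1]^2, scaled by the area 4, should be near 1:
## us_chk = matrix(runif(2e5, -1, 1), ncol=2)
## mean( w_cos(us_chk) ) * 4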
## Computes the matrix of the local cubic terms.
psi = function(u) {
## First variable.
u1 = u[, 1]
## Second variable.
u2 = u[, 2]
## Lexicographic ordering of polynomial terms.
p00 = 1
p01 = u2
p02 = u2 ^ 2 / 2
p03 = u2 ^ 3 / 6
p10 = u1
p11 = u1 * u2
p12 = u1 * u2 ^ 2 / 2
p20 = u1 ^ 2 / 2
p21 = u2 * u1 ^ 2 / 2
p30 = u1 ^ 3 / 6
## Return the psi matrix.
return( cbind(p00, p01, p02, p03, p10, p11, p12, p20, p21, p30) )
}
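## Note (our addition): the terms are u1^a * u2^b / (a! * b!) over all
## a + b <= 3, so the coefficient fitted to p00 is exactly the local cubic
## estimate of the function at x; P3_fit_x below therefore returns the
## first component of the coefficient vector.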
## Computes the local cubic estimate at x.
P3_fit_x = function(x, xdata, ydata, bwd) {
## Sample size.
nsamp = length(ydata)
## Transform xdata to localized covariates.
U = cbind(x[1] - xdata[, 1], x[2] - xdata[, 2]) / bwd
## Diagonal weight matrix.
W = diag( w_cos(U) )
## n-by-10 matrix of polynomial terms.
Psi = psi(U)
## 10-by-10 Gram matrix for P3 basis.
G = t(Psi) %*% W %*% Psi
## Inner products between ydata and P3.
v = t(Psi) %*% W %*% matrix(ydata, nrow=nsamp)
## Apply regularization only to small samples.
if( nsamp <= 150 ) { G = G + diag(1e-6, 10) }
## Local polynomial coefficients.
betahat = qr.solve(G, v)
## Return the first component.
return( betahat[1] )
}
## Wrapper function to compute P3_fit_x for many points.
P3_fit = function(xs, xdata, ydata, bwd) {
return( apply(xs, 1, P3_fit_x, xdata, ydata, bwd) )
}
## Computes Fhat.
Fhat = function(us, res) {
return( sapply(us, function(u, res) { return( mean( res <= u ) ) }, res) )
}
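## Quick illustration (our addition): with standard normal residuals, Fhat
## should track pnorm, e.g. Fhat(c(-1, 0, 1), rnorm(1e5)) is roughly
## c(0.159, 0.500, 0.841).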
## Computes the influence function for Fhat in the normal errors model.
bnorm = function(us, es) {
return( sapply(us, function(u, es) {
return( 1 * ( es <= u ) - pnorm(u) +
( es + u * ( es ^ 2 - 1 ) / 2 ) * dnorm(u) ) }, es) )
}
## Computes ( Fhat - F ) ^ 2 for the normal errors case.
diff_sq_norm = function(us, res) {
return( ( Fhat(us, res) - pnorm(us) ) ^ 2 )
}
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes 100, 200, 500 and 1000.
samples = c(100, 200, 500, 1000)
## t values to check inflated MSE.
tseq = c(-3, -2, -1, 0)
## Fhat - F values.
Fhat_diff = array(NA, c(runs, length(tseq)))
## Fhat ISE values.
Fhat_ISE = rep(NA, runs)
## Fhat bias values.
Fhat_bias = array(NA, c(length(samples), length(tseq)))
## Fhat variance values.
Fhat_var = array(NA, c(length(samples), length(tseq)))
## Fhat MSE values.
Fhat_MSE = array(NA, c(length(samples), length(tseq)))
## Fhat MISE values.
Fhat_MISE = rep(NA, length(samples))
## Open a progress bar.
pb = tkProgressBar(title="Fhat Simulation", min=0, max=runs)
## Simulation loop.
for(n in samples) {
## Fill progress bar with current information.
setTkProgressBar(pb, 0, label=paste0("Sample size: ", n))
## Set initial time difference to zero.
avg_time_diff = 0
## Get index for n in samples.
nind = which( samples == n )
for(s in 1 : runs) {
## Start a computing timer.
init_time = proc.time()[3]
## Covariates.
X = matrix(runif(2 * n, -1, 1), ncol=2)
## Responses generated through model equation.
Y = reg(X) + scl(X) * rnorm(n)
## Missingness indicators.
Z = deltas( PI(X) )
## Complete cases.
X1 = X[which( Z == 1 ), ]
Y1 = Y[ which( Z == 1 ) ]
## A suitable bandwidth.
bwd_reg = 3 * ( n * log(n) ) ^ ( -1 / 7 )
## Complete case fitted values.
fits_CC = P3_fit(X1, X1, Y1, bwd_reg)
## Complete case second conditional moment estimates.
m2_fits_CC = P3_fit(X1, X1, Y1 ^ 2, bwd_reg)
## Complete case scale values.
scl_CC = sqrt( abs( m2_fits_CC - fits_CC ^ 2 ) )
## Complete case standardized residuals.
res_CC = ( Y1 - fits_CC ) / scl_CC
## Fhat pointwise differences.
Fhat_diff[s, ] = Fhat(tseq, res_CC) - pnorm(tseq)
## Fhat ISE.
Fhat_ISE[s] = integrate(diff_sq_norm, -Inf, Inf, res_CC,
stop.on.error=FALSE)$value
## One step corrected estimate of mean computing time.
avg_time_diff = ( ( s - 1 ) * avg_time_diff +
proc.time()[3] - init_time ) / s
## Estimated time to finish.
ETF = round(avg_time_diff * ( runs - s ) / 3600, digits=1)
## Updated progress bar information.
pb_msg = paste0("Sample size: ", n, " (", ETF, " hour(s) to go)")
## Update progress bar.
setTkProgressBar(pb, s, label=pb_msg)
}
## Fhat inflated bias.
Fhat_bias[nind, ] = sqrt(n) * apply(Fhat_diff, 2, mean)
## Fhat inflated variance.
Fhat_var[nind, ] = n * apply(Fhat_diff, 2, var)
## Fhat inflated MSE.
Fhat_MSE[nind, ] = Fhat_bias[nind, ] ^ 2 + Fhat_var[nind, ]
## Fhat inflated MISE.
Fhat_MISE[nind] = n * mean(Fhat_ISE)
}
## Close the progress bar.
close(pb)
## Simulation results.
## Average probability of a complete case: E(delta) = E(1 - PI(X)) = 1/2
## here, which coincides with the average missingness by symmetry of PI.
avg_miss = 1 / 2
## True inflated MSE values for Fhat.
iMSE_norm = sapply(tseq, function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) / avg_miss
## Approximate inflated MISE value for Fhat.
iMISE_norm = sum( sapply(seq(-4, 4, by=0.0008), function(u) {
return( integrate(function(v, u) {
return( bnorm(u, v) ^ 2 * dnorm(v) ) }, -Inf, Inf, u,
stop.on.error=FALSE)$value ) } ) ) * 0.0008 / avg_miss
## Fhat bias results.
bias_results = cbind(samples, Fhat_bias)
## Fhat variance results.
var_results = cbind(samples, Fhat_var)
## Fhat MSE results.
MSE_results = cbind(c(samples, Inf), rbind(Fhat_MSE, iMSE_norm))
## Fhat MISE results.
MISE_results = rbind(c(samples, Inf), c(Fhat_MISE, iMISE_norm))
## Display simulation summary tables.
round(bias_results, digits=3)
round(var_results, digits=3)
round(MSE_results, digits=3)
round(MISE_results, digits=3)
## Save results to a file.
save(bias_results, var_results, MSE_results, MISE_results,
file="heterrdf_results.RData")
Python code:
###############################################################################
## A Python simulation to demonstrate the root-n consistency of F-hat via    ##
## the MSE and IMSE for finite samples.                                      ##
###############################################################################
## Required libraries.
import numpy as np
import scipy as sp
## Explicitly load the SciPy submodules used below; a bare 'import scipy'
## does not guarantee they are available on older SciPy versions.
import scipy.integrate
import scipy.linalg
from scipy.stats import uniform
from scipy.stats import norm
from progress.bar import Bar
from tabulate import tabulate
## Progress bar class.
class FancyBar(Bar):
    suffix = '%(percent)d%% - %(ETF).1fh to go.'
    @property
    def ETF(self):
        return self.eta / 3600.0
## Custom functions.
## The regression function at x.
def reg_x(x):
    return( 1.0 + x[0] - x[1]
            + 2.0 * np.exp( -np.sqrt( np.dot(x, x) ) / 2.0 ) )
## Wrapper function to compute regression for many points.
def reg(xs):
    ## Regression values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the regression value.
    for i in range(xs.shape[0]):
        vals[i] = reg_x(xs[i, :])
    ## Return the regression values.
    return(vals)
## Scale function at x.
def scl_x(x):
    return( np.sqrt( 1.0 + 2.0 * np.dot(x, x) ) )
## Wrapper function to compute scale for many points.
def scl(xs):
    ## Scale values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the scale value.
    for i in range(xs.shape[0]):
        vals[i] = scl_x(xs[i, :])
    ## Return the scale values.
    return(vals)
## Missingness probability function at x.
def PI_x(x):
    return( 1.0 / ( 1.0 + np.exp( -( x[0] + x[1] ) / 2.0 ) ) )
## Wrapper function to compute missingness probability for many points.
def PI(xs):
    ## Missingness probability values.
    vals = np.empty(xs.shape[0])
    ## For each value in xs compute the missingness probability value.
    for i in range(xs.shape[0]):
        vals[i] = PI_x(xs[i, :])
    ## Return the values.
    return(vals)
## Generate random missingness indicators: delta = 1 marks an observed
## response, which occurs with probability 1 - PI(x).
def deltas(probs):
    return( 1 * ( uniform.rvs(size=probs.size) > probs ) )
## Computes a product of cosine kernel functions.
def w_cos(u):
    ## Cosine kernel values for first variable.
    k1 = ( np.pi * np.cos( np.pi * u[:, 0] / 2.0 )
           * ( np.abs(u[:, 0]) <= 1 ) / 4.0 )
    ## Cosine kernel values for second variable.
    k2 = ( np.pi * np.cos( np.pi * u[:, 1] / 2.0 )
           * ( np.abs(u[:, 1]) <= 1 ) / 4.0 )
    ## Return product of kernel values.
    return( k1 * k2 )
## Computes the matrix of local cubic terms.
def psi(u):
    ## First variable.
    u1 = u[:, 0]
    ## Second variable.
    u2 = u[:, 1]
    ## Lexicographic ordering of polynomial terms.
    p00 = np.ones(u.shape[0])
    p01 = u2
    p02 = u2 ** 2.0 / 2.0
    p03 = u2 ** 3.0 / 6.0
    p10 = u1
    p11 = u1 * u2
    p12 = u1 * u2 ** 2.0 / 2.0
    p20 = u1 ** 2.0 / 2.0
    p21 = u2 * u1 ** 2.0 / 2.0
    p30 = u1 ** 3.0 / 6.0
    ## Return the psi matrix.
    return( np.column_stack((p00, p01, p02, p03, p10, p11, p12,
                             p20, p21, p30)) )
## Computes the local cubic estimate at x.
def P3_fit_x(x, xdata, ydata, bwd):
    ## Sample size.
    nsamp = ydata.size
    ## Transform xdata to localized covariates.
    U = np.column_stack( (x[0] - xdata[:, 0], x[1] - xdata[:, 1]) ) / bwd
    ## Diagonal weight matrix.
    W = np.diag( w_cos(U) )
    ## n-by-10 matrix of polynomial terms.
    Psi = psi(U)
    ## 10-by-10 Gram matrix for weighted P3 basis.
    G = np.matmul(np.matmul(Psi.T, W), Psi)
    ## Weighted inner products between ydata and P3.
    v = np.matmul(np.matmul(Psi.T, W), ydata.reshape( (nsamp, 1) ))
    ## Apply regularization to small samples.
    if nsamp <= 150:
        G = G + np.diag(np.ones(10) / 1.0e6)
    ## Local polynomial coefficients.
    betahat = sp.linalg.solve(G, v, assume_a='pos')
    ## Return the first component.
    return( betahat[0][0] )
## Wrapper function to compute local cubic estimates for many points.
def P3_fit(xs, xdata, ydata, bwd):
    ## Estimates.
    est = np.empty(xs.shape[0])
    ## For each value of xs calculate the regression estimate.
    for i in range(xs.shape[0]):
        est[i] = P3_fit_x(xs[i, :], xdata, ydata, bwd)
    ## Return the estimates.
    return(est)
## Computes Fhat.
def Fhat(us, res):
    ## Estimates.
    est = np.empty(us.size)
    ## Compute Fhat for each u value in us.
    for i in range(us.size):
        est[i] = np.mean( res <= us[i] )
    ## Return the estimates.
    return(est)
## Computes the influence function for Fhat in the normal model.
def b_norm(es, u):
    return( 1.0 * ( es <= u ) - norm.cdf(u)
            + ( es + u * ( es ** 2.0 - 1.0 ) / 2.0 ) * norm.pdf(u) )
## Integrand of true inflated MSE.
def kern_iMSE(es, u):
    return( b_norm(es, u) ** 2.0 * norm.pdf(es) )
## Integrand of the true inflated IMSE.
def kern_iIMSE(u):
    return( sp.integrate.quad(kern_iMSE, -np.inf, np.inf, args=(u,))[0] )
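## Note (our addition): kern_iIMSE nests one quadrature inside another, so
## the true inflated IMSE below is computed over the whole real line; the R
## script instead approximates the outer integral with a Riemann sum on
## [-4, 4].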
## Computes ( Fhat - F ) ^ 2 for the normal errors case; returns a scalar
## so it can be integrated directly with scipy.integrate.quad.
def diff_sq_norm(u, res):
    return( ( np.mean( res <= u ) - norm.cdf(u) ) ** 2.0 )
## Simulation.
## Number of simulation runs.
runs = 1000
## Sample sizes.
samples = np.array( [100, 200, 500, 1000] )
## u values to check inflated MSE.
useq = np.array( [-1.5, -1.0, 0.0, 1.0, 1.5] )
## Fhat - F values.
Fhat_diff = np.empty( [runs, useq.size] )
## ISE values.
Fhat_ISE = np.empty(runs)
## Inflated bias values.
Fhat_ibias = np.empty( [samples.size, useq.size] )
## Inflated variance values.
Fhat_ivar = np.empty( [samples.size, useq.size] )
## Inflated MSE values.
Fhat_iMSE = np.empty( [samples.size, useq.size] )
## Inflated IMSE values.
Fhat_iIMSE = np.empty(samples.size)
## Simulation loop.
for n in samples:
    ## Get the index for n in samples.
    n_ind = np.where( samples == n )[0][0]
    ## Open a progress bar.
    bar = FancyBar(message='Sample size: ' + str(n), max=runs)
    for s in range(runs):
        ## Generate covariates.
        x1 = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        x2 = 2.0 * ( uniform.rvs(size=n) - 1.0 / 2.0 )
        X = np.column_stack( (x1, x2) )
        ## Responses generated through the model equation.
        Y = reg(X) + scl(X) * norm.rvs(size=n)
        ## Missingness indicators.
        Z = deltas( PI(X) )
        ## Complete cases.
        X1 = X[np.where( Z == 1 )[0], :]
        Y1 = Y[ np.where( Z == 1 )[0] ]
        ## A suitable bandwidth for the local cubic smoother.
        bwd_reg = 3.0 * ( n * np.log(n) ) ** ( -1.0 / 7.0 )
        ## Complete case fitted values.
        fits_CC = P3_fit(X1, X1, Y1, bwd_reg)
        ## Complete case second moment estimates.
        m2_fits_CC = P3_fit(X1, X1, Y1 ** 2.0, bwd_reg)
        ## Complete case scale estimates.
        scl_CC = np.sqrt( np.abs( m2_fits_CC - fits_CC ** 2.0 ) )
        ## Complete case residuals.
        res_CC = ( Y1 - fits_CC ) / scl_CC
        ## Fhat pointwise differences.
        Fhat_diff[s, :] = Fhat(useq, res_CC) - norm.cdf(useq)
        ## Fhat ISE values.
        Fhat_ISE[s] = sp.integrate.quad(diff_sq_norm, -np.inf, np.inf,
                                        args=(res_CC,))[0]
        ## Advance the progress bar.
        bar.next()
    ## Fhat inflated bias.
    Fhat_ibias[n_ind, :] = np.sqrt(n) * np.mean(Fhat_diff, axis=0)
    ## Fhat inflated variance.
    Fhat_ivar[n_ind, :] = n * np.var(Fhat_diff, axis=0)
    ## Fhat inflated MSE.
    Fhat_iMSE[n_ind, :] = Fhat_ibias[n_ind, :] ** 2.0 + Fhat_ivar[n_ind, :]
    ## Fhat inflated IMSE.
    Fhat_iIMSE[n_ind] = n * np.mean(Fhat_ISE)
    ## Close the progress bar for this sample size.
    bar.finish()
## Simulation results.
## Average probability of a complete case: E(delta) = E(1 - PI(X)) = 1/2 here.
avg_miss = 1.0 / 2.0
## True inflated MSE values.
iMSE_norm = np.empty(useq.size)
for i in range(useq.size):
    iMSE_norm[i] = sp.integrate.quad(kern_iMSE, -np.inf, np.inf,
                                     args=(useq[i],))[0] / avg_miss
## True inflated IMSE values.
iIMSE_norm = sp.integrate.quad(kern_iIMSE, -np.inf, np.inf)[0] / avg_miss
## Fhat bias results.
bias_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_ibias) )
## Fhat variance results.
var_results = np.column_stack(
(samples.reshape( (samples.size, 1) ), Fhat_ivar) )
## Vector of sample sizes plus infinity.
samp_plus_inf = np.append(samples, np.inf).reshape( (samples.size + 1, 1) )
## Fhat inflated MSE with true inflated MSE values.
MSE_vals = np.vstack( (Fhat_iMSE, iMSE_norm) )
## Fhat inflated MSE results.
MSE_results = np.column_stack( (samp_plus_inf, MSE_vals) )
## Fhat inflated IMSE with true inflated IMSE values.
IMSE_vals = np.append(Fhat_iIMSE, iIMSE_norm)
## Fhat inflated IMSE results.
IMSE_results = np.column_stack( (samp_plus_inf, IMSE_vals) )
## Display simulation summary tables.
print( tabulate(bias_results) )
print( tabulate(var_results) )
print( tabulate(MSE_results) )
print( tabulate(IMSE_results) )
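## Unlike the R script, nothing is written to disk here. A minimal sketch,
## if saved output is wanted (our addition, not part of the original):
## np.savez("heterrdf_results.npz", bias=bias_results, var=var_results,
##          MSE=MSE_results, IMSE=IMSE_results)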