The following is an R script that exercises and checks the confidence boxes (c-boxes) for the continuous uniform distribution, which is parameterized either by specifying its minimum and maximum or by specifying its midpoint and width. This script was created as part of our quality assurance efforts for c-box software. You can run the script and see its output by sequentially pasting its sections (delimited by rows of pound signs #) into the R console.
#
# As always, you should be able to copy the content between the rows of pound signs
# and paste it into R to replicate the simulations and see the graphical results, or
# you can turn on History/Recording for the plot window, then paste the entire
# contents of this file into R and use PageUp and PageDown to see the output. The
# results depend on stochastic simulations, so it is useful to run them a few times
# to observe the range of variation in the outputs.
#
####################################################################################
# Below are the R functions implementing c-boxes for uniform distributions, assuming
# x[i] ~ uniform(minimum, maximum), which can equivalently be parameterized by the
# distribution's midpoint and width.
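# In all of these functions, the width c-box is represented by Monte Carlo deviates
#   W = (max(x)-min(x)) / B,   B ~ Beta(n-1, 2),   n = length(x),
# and, given W, the midpoint c-box by
#   M = (max(x)-W/2) + (W-(max(x)-min(x))) * U,   U ~ Uniform(0,1),
# so that minimum = M - W/2 and maximum = M + W/2; the functions below simply write
# out this construction.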
nextvalue.uniform <- function(x) {
w=(max(x)-min(x))/beta(length(x)-1,2);
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1);
return(uniform(m-w/2, m+w/2))
}
parameter.uniform.minimum <- function(x) {
w=(max(x)-min(x))/beta(length(x)-1,2);
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1);
return(m-w/2)
}
parameter.uniform.maximum <- function(x) {
w=(max(x)-min(x))/beta(length(x)-1,2);
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1);
return(m+w/2)
}
parameter.uniform.midpoint <- function(x) {
w=(max(x)-min(x))/beta(length(x)-1,2);
return((max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1))
}
parameter.uniform.width <- function(x) return((max(x)-min(x))/beta(length(x)-1,2))
uniform <- function(a,b) runif(many, a,b)
# ancillary functions
many = 2000 # number of Monte Carlo deviates per c-box; increase for more accuracy
beta <- function(v,w) if (v==0) rep(0,many) else if (w==0) rep(1,many) else rbeta(many, v, w) # Beta(v,w) deviates; masks base R's beta(); degenerate shapes v=0 or w=0 handled explicitly
env <- function(x,y) c(x,y) # form a p-box by concatenating two sets of deviates (not used in this script)
edf <- function(x, ...) lines(c(min(x),sort(x)),seq(0,1,length.out=1+length(x)),type='s', ...) # add an empirical distribution function to the current plot
plotbox <- function(b,new=TRUE,col='blue',lwd=2,xlim=range(b[is.finite(b)]),ylim=c(0,1),xlab='',ylab='Prob',...) {
edf <- function (x, col, lwd, ...) {
n <- length(x)
s <- sort(x)
lines(c(s[[1]],s[[1]]),c(0,1/n),lwd=lwd,col=col,...)
for (i in 2:n) lines(c(s[[i-1]],s[[i]],s[[i]]),c(i-1,i-1,i)/n,col=col,lwd=lwd,...)
}
b = ifelse(b==-Inf, xlim[1] - 10, b)
b = ifelse(b==Inf, xlim[2] + 10, b)
if (new) plot(NULL, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab)
if (length(b) < many) edf(b,col,lwd) else
edf(c(min(b),max(b),b[1:min(length(b),many)]),col,lwd)
if (many < length(b)) edf(c(min(b),max(b),b[(many+1):length(b)]),col,lwd)
}
scalar = function(s,...) abline(v=s,...)
# example
par(mfrow=c(2,1))
n = 8
a = 0
b = 10
width = (b - a)
midpoint = (a + b) / 2
x = runif(n,a,b)
L = -10
w = parameter.uniform.width(x)
m = parameter.uniform.midpoint(x)
plot(NULL,xlim=c(L,20),ylim=c(0,1))
plotbox(w,FALSE); scalar(midpoint,col='red',lwd=2); text(L, 0.9, 'midpoint',col='red')
plotbox(m,FALSE,col='red'); scalar(width,col='blue',lwd=2); text(L, 0.75, 'width',col='blue')
m = parameter.uniform.minimum(x)
M = parameter.uniform.maximum(x)
plot(NULL,xlim=c(L,20),ylim=c(0,1))
edf(x,lwd=3)
edf(m,lwd=3,col=3); scalar(a,col=3,lwd=2); text(L,0.9,'minimum',col=3)
edf(M,lwd=3,col=6); scalar(b,col=6,lwd=2); text(L,0.75,'maximum',col=6)
scalar(max(x),col='gray',lty=3)
scalar(min(x),col='gray',lty=3)
# The colored c-boxes are estimating the true parameter values which are
# depicted as the vertical lines of the same color.
# Notice that the estimates of width and maximum are different, even when
# the minimum a=0. For instance, the blue c-box for width does not start
# at max(x) like the magenta c-box for the maximum.
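# A quick numeric check of that claim (a sketch using the x, w, and M computed above):
# the width c-box starts at the sample range max(x)-min(x), whereas the maximum
# c-box starts at the sample maximum max(x).
cat('left edge of width c-box  ', signif(min(w),4), ' vs sample range  ', signif(max(x)-min(x),4), '\n')
cat('left edge of maximum c-box', signif(min(M),4), ' vs sample maximum', signif(max(x),4), '\n')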
#########################################################################
# comparison with frequentist point estimators of the maximum: MLE (maximum likelihood),
# MVUE (minimum-variance unbiased), and MMSEE (minimum mean-squared-error) estimators
if (a!=0) cat('Formulas for MLE, MVUE, MMSEE estimates are wrong\n') else {
t = 10:1000/100 # used only by the commented-out diagnostic plot below
t[t<max(x)] = 0
#plot(t,log(1/(t^some)), type='l')
MLE = max(x)
MVUE = ((n+1)/n)*max(x)
MMSEE = ((n+2)/(n+1))*max(x)
scalar(MLE,col='gray',lwd=3)
scalar(MVUE,col='gray',lwd=3)
scalar(MMSEE,col='gray',lwd=3)
cat('MLE',MLE,'\nMVUE',MVUE,'\nMMSEE', MMSEE,'\n')
}
# If shown, the gray vertical lines depict several frequentist estimators
# for the maximum.
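# A small Monte Carlo sketch (using the same n and b as above, with a = 0) comparing
# the average of each point estimator to the true maximum b: the MVUE should come out
# unbiased, the MLE biased low, and the MMSEE in between.
reps = 5000
mx = replicate(reps, max(runif(n, 0, b)))
cat('true maximum', b,
    '\nmean MLE    ', signif(mean(mx),4),
    '\nmean MVUE   ', signif(mean(((n+1)/n)*mx),4),
    '\nmean MMSEE  ', signif(mean(((n+2)/(n+1))*mx),4), '\n')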
#########################################################################
# repeated sampling plots for MIDPOINT and WIDTH
shomid = TRUE # show the midpoint c-boxes and the true midpoint
showid = TRUE # show the width c-boxes and the true width
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 5, 10)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2 # true minimum, a distribution parameter
b = mm[[i]] + ww[[j]]/2 # true maximum, a distribution parameter
n = nn[[(i-1)*5 + j]] # sample size
if (shomid & showid)
xlim=c(0,1.5*ww[[j]], mm[[i]]-ww[[j]], mm[[i]]+ww[[j]]) else
if (shomid)
xlim=c(mm[[i]]-ww[[j]], mm[[i]]+ww[[j]]) else
if (showid)
xlim=c(0,1.5*ww[[j]]) else
cat('Nothing to show\n')
plot(NULL,xlim=range(xlim),ylim=c(0,1))
if (shomid) lines(rep(mm[[i]],2),c(0,1),col='red',lwd=3)
if (showid) lines(rep(ww[[j]],2),c(0,1),col='blue',lwd=3)
title(paste(n,'~U(',a,',',b,')'))
for (k in 1:10) {
x = runif(n,a,b) # some sample data
m = parameter.uniform.midpoint(x)
w = parameter.uniform.width(x)
if (shomid) edf(m,lwd=2)
if (showid) edf(w,col='gray',lwd=2)
}
if (shomid) lines(rep(mm[[i]],2),c(0,1),col='red',lwd=1)
if (showid) lines(rep(ww[[j]],2),c(0,1),col='blue',lwd=1)
}
# the true midpoint is red, estimated by the black c-boxes
# the true width is blue, estimated by the gray c-boxes
# the estimates are often very close to the true values
#########################################################################
# repeated sampling plots for NEXTVALUE
par(mfrow=c(3,3))
nn = c(2,3,4,5,7,10,15,20,200);
a = 1
b = 3
for (n in nn) {
plot(c(a,b),c(0,1), xlim=c(a-(a+b)/2,b+(a+b)/2),ylim=c(0,1),type='l',col='red')
title(paste('n =',n))
for (i in 1:500) {
x = runif(n,a,b)
nv = nextvalue.uniform(x)
edf(nv)
}
lines(c(a,b),c(0,1), col='red',lwd=3)
}
#########################################################################
# repeated sampling plots for NEXTVALUE for just TWO SAMPLE POINTS
par(mfrow=c(1,1))
n = 2
a = 1
b = 3
many=10000
plot(c(a,b),c(0,1), xlim=c(a-(a+b)/2,b+(a+b)/2),ylim=c(0,1),type='l',col='red')
title(paste('n =',n))
for (i in 1:100) {
x = runif(n,a,b)
nv = nextvalue.uniform(x)
edf(nv)
}
lines(c(a,b),c(0,1), col='red',lwd=3)
#########################################################################
# the data sets yielding the extremes are obvious
many=10000
plot(c(a,b),c(0,1), xlim=c(a-(a+b)/2,b+(a+b)/2),ylim=c(0,1),type='l',col='red')
title(paste('n =',n))
for (i in 1:1000) {
x = runif(n,a,b)
nv = nextvalue.uniform(x)
edf(nv)
}
lines(c(a,b),c(0,1), col='red',lwd=3)
x = range(a,b)
nv = nextvalue.uniform(x)
edf(nv, col='magenta', lwd=3)
x = range(a,a)
nv = nextvalue.uniform(x)
edf(nv, col='green', lwd=3)
x = range(b,b)
nv = nextvalue.uniform(x)
edf(nv, col='blue', lwd=3)
#W = 1
#for (i in seq(a,b,length.out=30)) {
# x = range(i, b)
# nv = nextvalue.uniform(x)
# edf(nv, col='yellow', lwd=3)
# msg = paste('x = c(',signif(i,3),',',b,')')
# text(0,0.5,msg)
# Sys.sleep(W)
# text(0,0.5,msg,col='white')
# }
# green x = c(1,1)
# magenta x = c(1,3)
# blue x = c(3,3)
# black x = runif(2,1,3)
# red the true distribution
# if any c-box is above the magenta c-box on the left or below it on the right,
# increase the constant 'many' so the approximation is better
#########################################################################
# Michael's email seems to suggest this formulation for the Singh plot for
# c-boxes for WIDTH of a uniform distribution (one of its two parameters)
some = 1e3 # reducing this to 1e2 makes the roughness of the approximation visible
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccfTrk = array(0, some)
for (kk in 1:some) {
x = rbeta( n , 1 , 1 )*(b-a)
qLoc = ( max(x) - min(x) ) / (b-a)
ccfTrk[[kk]] = 1 - qLoc^(n-1) * ( n - (n-1)*qLoc )
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccfTrk) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
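# A quick sanity check of the closed form used in ccfTrk above: if B ~ Beta(n-1, 2),
# its CDF is P(B <= q) = q^(n-1) * (n - (n-1)*q), so ccfTrk is just 1 - pbeta(qLoc, n-1, 2),
# i.e., the fraction of the width c-box lying below the true width, computed exactly
# rather than by simulation.  A numeric spot check (nchk and qq are arbitrary values):
nchk = 7
qq = c(0.1, 0.3, 0.5, 0.7, 0.9)
cbind(closed.form = 1 - qq^(nchk-1)*(nchk-(nchk-1)*qq), via.pbeta = 1 - pbeta(qq, nchk-1, 2))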
#########################################################################
# Scott converted Michael's rather cryptic code above into the (slower but)
# more transparent computation below
# Singh plots for the c-box for the WIDTH of a uniform distribution
some = 1e3 # increase to show perfection
many = 2000
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccf = array(0, some)
for (k in 1:some) {
x = runif(n,a,b) # sample data
w = parameter.uniform.width(x)
ccf[[k]] = sum(w < (b-a)) / many # (b-a) is ww[[j]]
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccf) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
#########################################################################
# Singh plots for the c-box for the MIDPOINT of a uniform distribution
# which is the other of its two parameters
some = 1e3 # increase to show perfection
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccf = array(0, some)
for (k in 1:some) {
x = runif(n,a,b) # sample data
m = parameter.uniform.midpoint(x)
ccf[[k]] = sum(m < mm[[i]]) / many
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccf) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
#########################################################################
# Can we estimate the minimum and maximum of a uniform distribution too?
# Yes, but we must properly represent the dependence in the calculation.
# Singh plots for a NAIVE c-box for the maximum estimated by convolution
some = 1e3 # increase to show perfection
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccf = array(0, some)
for (k in 1:some) {
x = runif(n,a,b) # sample data
m = parameter.uniform.midpoint(x)
w = parameter.uniform.width(x)
mw = m + w/2 # convolution
ccf[[k]] = sum(mw < b) / many
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccf) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
# these plots show that this approach is not quite correct: the midpoint and width
# c-boxes come from separate calls with independent random draws, so the dependence
# between them is lost when they are convolved
#########################################################################
# a better way to estimate the min and max of uniforms: compute the width and midpoint
# deviates from the same random draws within a single call, so their dependence is
# preserved when forming midpoint - width/2 and midpoint + width/2
# x[i] ~ uniform(minimum, maximum)
parameter.uniform.minimum <- function(x) {n=length(x); w=(max(x)-min(x))/beta(length(x)-1,2); m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); return(m-w/2)}
parameter.uniform.maximum <- function(x) {n=length(x); w=(max(x)-min(x))/beta(length(x)-1,2); m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); return(m+w/2)}
# Singh plot for the c-box for the MAXIMUM of a uniform distribution
some = 1e3 # increase to show perfection
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccf = array(0, some)
for (k in 1:some) {
x = runif(n,a,b) # sample data
mw = parameter.uniform.maximum(x)
ccf[[k]] = sum(mw < b) / many
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccf) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
#########################################################################
# Singh plot for the c-box for the MINIMUM of a uniform distribution
some = 5e3 # increase to show perfection
par(mfrow=c(5,5))
nn = round(runif(5*5,2,25))
mm = c(-20,-1, 0, 1, 20)
ww = c(0.01, 0.1, 1, 4, 8)
for (i in 1:5) for (j in 1:5) {
a = mm[[i]] - ww[[j]]/2
b = mm[[i]] + ww[[j]]/2
n = nn[[(i-1)*5 + j]]
ccf = array(0, some)
for (k in 1:some) {
x = runif(n,a,b) # sample data
mw = parameter.uniform.minimum(x)
ccf[[k]] = sum(mw < a) / many
}
plot(c(0,1),c(0,1),type='l',col=2)
lines( sort( ccf) , (1:some)/some)
title(paste(n,'~U(',a,',',b,')'))
}
######################################################################################
# What is the dependence between the midpoint and width estimator?
x = runif(8,0,1) # sample data (set before generating the deviates below)
many = 15000
# ------------------------------------------------------ parameter.uniform.width -----
width = (max(x)-min(x)) / rbeta(many, length(x)-1,2)
# ------------------------------------------------------ parameter.uniform.midpoint --
w = (max(x)-min(x)) / rbeta(many, length(x)-1,2)
midpoint = (max(x)-w/2)+(w-(max(x)-min(x)))*runif(many)
# ------------------------------------------------------ nextvalue.uniform -----------
w = (max(x)-min(x)) / rbeta(many, length(x)-1,2)
m = (max(x)-w/2)+(w-(max(x)-min(x)))*runif(many)
par(mfrow=c(3,1))
plot(width,midpoint); title('parameter.uniform')
plot(w,m,col='red'); title('nextvalue.uniform')
plot(width,midpoint)
points(w,m,col='red'); title('(superimposed)')
cor(width,midpoint)
cor(w,m)
####################################################################################
# What is the dependence between the minimum and maximum parameters
x = runif(20,0,1)
many = 15000
w=(max(x)-min(x))/beta(length(x)-1,2)
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1)
minimum = m-w/2
w=(max(x)-min(x))/beta(length(x)-1,2) # indep. of earlier w
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1) # indep. of earlier m
maximum = m+w/2
w=(max(x)-min(x))/beta(length(x)-1,2) # indep. of earlier w
m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1) # indep. of earlier m
m1 = m-w/2
m2 = m+w/2
par(mfrow=c(3,1))
plot(minimum,maximum); title('parameter.uniform')
plot(m1,m2,col='red'); title('nextvalue.uniform')
plot(m1,m2,col='red'); title('(superimposed)')
points(minimum,maximum)
cor(minimum,maximum)
cor(m1,m2)
# the correlations are quite different, but the graphs only look really different for small n
######################################################################################
# the following function alters the dependence of m and w inside the nextvalue function
bad.nextvalue.uniform <- function(x) {
n=length(x);
w =(max(x)-min(x))/beta(length(x)-1,2);
ww=(max(x)-min(x))/beta(length(x)-1,2); # independent of w
m =(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); # midpoint depends on w
return(uniform(m-ww/2, m+ww/2)) # width is given as ww
}
many = 50000
par(mfrow=c(3,1))
for (n in c(2,4,6)) {
x = runif(n,0,1)
nv = nextvalue.uniform(x)
bnv = bad.nextvalue.uniform(x)
plotbox(bnv,xlim=c(-2,3),col='red')
plotbox(nv,col='blue',new=FALSE)
title(paste('n =',length(x)))
}
# seems only to matter when n is very small, and even then, it doesn't matter much
# Scott is only guessing that bad.nextvalue.uniform is bad...is there an analog of
# the Singh plot for the nextvalue distributions with which we can check?
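# One possible analog (only a sketch, not a vetted procedure): a probability-integral-
# transform style check.  For each replicate, simulate data, form the nextvalue
# distribution, and record how much of its mass lies below a fresh draw from the true
# distribution; if the predictive distribution is well calibrated, these recorded
# values should be roughly uniform on [0,1], i.e., close to the diagonal below.
many = 2000 # reset from the larger value used above
some = 1000
a = 1; b = 3; n = 5
pit = array(0, some)
for (k in 1:some) {
  x = runif(n, a, b) # sample data
  nv = nextvalue.uniform(x)
  z = runif(1, a, b) # a fresh 'next value' from the true distribution
  pit[[k]] = sum(nv < z) / many # mass of the nextvalue distribution below z
}
par(mfrow=c(1,1))
plot(c(0,1), c(0,1), type='l', col=2)
lines(sort(pit), (1:some)/some)
title('PIT-style check of nextvalue.uniform (sketch)')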
####################################################################################
# compare to the dependence between a normal's mu and sigma
many=10000
x = rnorm(2,10,1)
n=length(x)
xlim = c(0,20) # depends on n, mu, sigma
ylim = c(0,15) # depends on n, mu, sigma
mu = mean(x)+sd(x)*rt(many,n-1)/sqrt(n)
sigma = sqrt(var(x)*(n-1)/rchisq(many,n-1))
s = sqrt(var(x)*(n-1)/rchisq(many,n-1))
m = rnorm(many,mean(x), s/sqrt(n)) # uses s
par(mfrow=c(3,1))
plot(mu,sigma,xlim=xlim,ylim=ylim)
plot(m,s,col='red',xlim=xlim,ylim=ylim)
plot(mu,sigma,xlim=xlim,ylim=ylim)
points(m,s,col='red')
cor(mu,sigma) # uncorrelated
cor(m,s) # strongly dependent
####################################################################################
# alter the dependence between a normal's mu and sigma
nextvalue.normal <- function(x) {
n <- length(x);
return(mean(x) + sd(x) * student(n - 1) * sqrt(1 + 1 / n))
}
student <- function(v) rt(many, v)
nv = nextvalue.normal(x)
n1 = rnorm(many, m, s) # preserves dependence
n2 = rnorm(many, mu, sigma) # alters dependence
par(mfrow=c(1,1))
xlim = range(c(nv,n1,n2)) # data-driven range (overridden by the fixed range on the next line)
xlim = c(5,15)
plot(NULL,xlim=xlim,ylim=c(0,1))
edf(nv,col='gray')
edf(n1,col='blue')
edf(n2,col='red')
signif(var(nv),2)
signif(var(n1),2)
signif(var(n2),2)
# gray and blue calculations preserve dependence
# red calculation alters dependence, and may be erroneous