Software

This page gives routines for computing elementary c-boxes for parameters for several sampling models and the associated p-boxes for the next observable value. All routines are written in R. Example calculations that exercise these functions and exemplify the several c-boxes are displayed in a bold typeface. We welcome alternative implementations or translations into other languages.

The functions below make references to a variety of probability distributions. There are many schemes that can be used to represent distributions. For example, the APPL package for Maple specifies the distribution symbolically so users can flexibly convert among the cdf, pdf and random-value display modes. Risk Calc uses a combination of symbolic and numerical discretization of the distribution function. The R package "distrib" simply generates a large number of random samples from the distribution and uses this population of values en mass as a characterization of the distribution. R itself never tries to represent a probability distribution per se, but only allows users various ways to access a distribution's pdf, cdf, and quantile functions, and to generate random values. The ancillary functions employed below use a particularly simple representation similar to that used in the "distrib" package. This choice makes many mathematical operations especially convenient for the purposes of the examples on this page, although it does not allow for general convolutions among the results. To compute such convolutions, you will need to replace the ancillary functions with more elaborate routines such as those available in pbox.r or Risk Calc.

This page also includes some straightforward Monte Carlo simulations for checking the performance of the results. These simulations demonstrate that the confidence intervals produced by this approach have the prescribed coverages. R functions for three such simulations are provided below. The first is coveragenormal.mu which uses Monte Carlo methods to check that the confidence distribution for the normal mean produces valid confidence intervals at any level of interest. In calling this function, specify a sample size as n and the normal distribution's true mean as MU and true standard deviation as SIGMA. You may also specify (by name) a confidence level or a pair of alpha and beta values. The second function that checks performance is coveragenormal.mudiff, which has the same arguments of sample size, true mean, and true standard deviation, but now these arguments should each be a pair of values, one for each of the two normal distributions from which samples are to be compared. The third function checks the performance of the c-box for a de-biased constant obtained from a some biased measurements of it and a few independently collected high-precision measurements of the error. Note that, for the sake of efficiency, the coverage simulations use a couple of functions that are redundant with functions defined earlier.

#############################################################################

# C-boxes for parameters and the next observable value

#############################################################################

# x[i] ~ Bernoulli(parameter), x[i] is either 0 or 1

nextvalue.bernoulli <- function(x) {n <- length(x); k <- sum(x); return(env(bernoulli(k/(n+1)), bernoulli((k+1)/(n+1))))}

parameter.bernoulli <- function(x) {n <- length(x); k <- sum(x); return(env(beta(k, n-k+1), beta(k+1, n-k)))}

# x[i] ~ binomial(N, p), for known N, x[i] is a nonnegative integer less than or equal to N

nextvalue.binomial <- function(x,N) {n <- length(x); k<-sum(x); return(env(betabinomial(N,k,n*N-k+1),betabinomial(N,k+1, n*N-k)))}

parameter.binomial <- function(x,N) {n <- length(x); k <- sum(x); return(env(beta(k, n*N-k+1), beta(k+1, n*N-k)))}

# x[i] ~ binomial(N, p), for unknown N, x[i] is a nonnegative integer

# see https://sites.google.com/site/cboxbinomialnp/

nextvalue.binomialnp <- function(x) { }

parameter.binomialnp.n <- function(x) { }

parameter.binomialnp.p <- function(x) { }

# x[i] ~ Poisson(parameter), x[i] is a nonnegative integer

nextvalue.poisson <- function(x) {n <- length(x); k = sum(x); return(env(negativebinomial(size=k, prob=1-1/(n+1)),negativebinomial(size=k+1, prob=1-1/(n+1))))}

parameter.poisson <- function(x) {n <- length(x); k = sum(x); return(env(gamma(shape=k, rate=n),gamma(shape=k+1, rate=n)))}

# x[i] ~ exponential(parameter), x[i] is a nonnegative integer

nextvalue.exponential <- function(x) {n <- length(x); k = sum(x); return(gammaexponential(shape=n, rate=k))}

parameter.exponential <- function(x) {n <- length(x); k = sum(x); return(gamma(shape=n, rate=k))}

qgammaexponential <- function(p, shape, rate=1, scale=1/rate) rate * ((1-p)^(-1/shape) - 1)

rgammaexponential <- function(many, shape, rate=1, scale=1/rate) return(qgammaexponential(runif(many),shape,rate,scale))

# x[i] ~ normal(mu, sigma)

nextvalue.normal <- function(x) {n <- length(x); return(mean(x) + sd(x) * student(n - 1) * sqrt(1 + 1 / n))}

parameter.normal.mu <- function(x) {n <- length(x); return(mean(x) + sd(x) * student(n - 1) / sqrt(n))}

parameter.normal.sigma <- function(x) {n <- length(x); return(sqrt(var(x)*(n-1)/chisquared(n-1)))}

# x[i] ~ lognormal(mu, sigma), x[i] is a positive value whose logarithm is distributed as normal(mu, sigma)

nextvalue.lognormal <- function(x) {n <- length(x); return(exp(mean(log(x)) + sd(log(x)) * student(n - 1) * sqrt(1+1/n)))}

parameter.lognormal.mu <- function(x) {n <- length(x); return(mean(log(x)) + sd(log(x)) * student(n - 1) / sqrt(n))}

parameter.lognormal.sigma <- function(x) {n <- length(x); return(sqrt(var(log(x))*(n-1)/chisquared(n-1)))}

# x[i] ~ uniform(minimum, maximum), or

# x[i] ~ uniform(midpoint, width)

nextvalue.uniform <- function(x) {n=length(x); w=(max(x)-min(x))/beta(length(x)-1,2); m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); return(uniform(m-w/2, m+w/2))}

parameter.uniform.minimum <- function(x) {n=length(x); w=(max(x)-min(x))/beta(length(x)-1,2); m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); return(m-w/2)}

parameter.uniform.maximum <- function(x) {n=length(x); w=(max(x)-min(x))/beta(length(x)-1,2); m=(max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1); return(m+w/2)}

parameter.uniform.width <- function(x) return((max(x)-min(x))/beta(length(x)-1,2))

parameter.uniform.midpoint <- function(x) {w=(max(x)-min(x))/beta(length(x)-1,2); return((max(x)-w/2)+(w-(max(x)-min(x)))*uniform(0,1))}

# x[i] ~ F, a continuous but unknown distribution

nextvalue.nonparametric <- function(x) return(env(histogram(c(x, Inf)), histogram(c(x, -Inf))))

# x[i] ~ normal(mu1, sigma1), y[j] ~ normal(mu2, sigma2), x and y are independent

parameter.normal.meandifference <- function(x, y) parameter.normal.mu(x) - parameter.normal.mu(y)

# x[i] = Y + error[i], error[j] ~ F, F unknown, Y fixed, x[i] and error[j] are independent

parameter.nonparametric.deconvolution <- function(x, error) {# i.e., the c-box for Y

z = NULL

for(jj in 1:length(error)) z = c(z, y - error[jj])

z = sort(z)

Q = Get_Q(length(y), length(error))

w = Q / sum( Q )

env(mixture(z,w), mixture(c(z[-1],Inf),w))

}

#############################################################################

# Ancillary functions

#############################################################################

# these functions may be replaced with better representations of distributions that make subsequent calculations easier

many = 2000 # increase for more accuracy

beta <- function(v,w) if (v==0) rep(0,many) else if (w==0) rep(1,many) else rbeta(many, v, w)

bernoulli <- function(p) runif(many) < p

betabinomial <- function(size,v,w) rbinom(many, size, beta(v,w))

negativebinomial <- function(size,prob) rnbinom(many, size, prob)

gamma <- function(shape,rate=1,scale=1/rate) {rate = 1/scale; rgamma(many, shape,rate)}

gammaexponential <- function(shape,rate=1,scale=1/rate) {rate = 1/scale; rexp(many,gamma(shape,rate))}

normal <- function(m,s) rnormal(many, m,s)

student <- function(v) rt(many, v)

chisquared <- function(v) rchisq(many, v)

lognormal <- function(m,s) rlnorm(many, m,s)

uniform <- function(a,b) runif(many, a,b)

histogram <- function(x) x[trunc(runif(many)*length(x))+1]

mixture <- function(x,w) {r=sort(runif(many)); x=c(x[1],x); w=cumsum(c(0,w))/sum(w); u=NULL; j=length(x); for (p in rev(r)) {repeat {if (w[[j]] <= p) break; j = j - 1}; u=c(x[[j+1]],u); }; u[order(runif(length(u)))] }

env <- function(x,y) c(x,y)
# alternatively...
#constant <- function(b) if (length(b)==1) TRUE else FALSE

#precise <- function(b) if (constant(b) || (length(b)==many)) TRUE else FALSE

#leftside <- function(b) if (precise(b)) return(b) else return(b[1:many])

#rightside <- function(b) if (precise(b)) return(b) else return(b[(many+1):(2*many)])

#sampleindices = function() round(runif(many)*(many-1) + 1)

#env <- function(x,y) c(pmin(sort(leftside(x)), sort(leftside(y)))[sampleindices()],pmax(sort(rightside(x)), sort(rightside(y)))[sampleindices()])

# Bayes' rule

Bzero <- 1e-6

Bone <- 1-Bzero

ratiokm <- function(k,m) return(1/(1+m/k))

ratioKN <- function(k,n) return(k/n)

jeffkm = function(k,m) return(beta(k+0.5, m+0.5))

jeffKN = function(k,n) return(beta(k+0.5, n-k+0.5))

km <- function(k,m) {

if ((k < 0) || (m < 0)) stop('Improper arguments to function km')

return(pmin(pmax(env(beta(k,m+1),beta(k+1,m)),Bzero),Bone))

}

KN = function(k,n) {

if ((k < 0) || (n < k)) stop('Improper arguments to function KN')

return(pmin(pmax(env(beta(k,n-k+1),beta(k+1,pmax(0,n-k))),Bzero), Bone))

}

Bppv = function(p,s,t) return(1/(1+((1/p-1)*(1-t))/s))

Bnpv = function(p,s,t) return(1/(1+(1-s)/(t*(1/p-1))))

# the mk argments below can be km, KN, jeffkm, jeffKN, ratiokm, ratioKN

ppv = function(pk,pm,sk,sm,tk,tm, mk=km) return(Bppv(mk(pk,pm), mk(sk,sm), mk(tk,tm)))

npv = function(pk,pm,sk,sm,tk,tm, mk=km) return(Bnpv(mk(pk,pm), mk(sk,sm), mk(tk,tm)))

ANDi = function(x,y) {

nx = length(x)

ny = length(y)

if ((nx==1) && (ny==1)) return(x*y)

c(x[1:many]*y[1:many], x[(many+1):nx]*y[(many+1):ny])

}

ORi = function(x,y) { # x+y-xy == 1-(1-x)*(1-y)

nx = length(x)

ny = length(y)

if ((nx==1) && (ny==1)) return(1-(1-x)*(1-y))

c(1-(1-x[1:many] )*(1-y[1:many]), 1-(1-x[(many+1):nx] )*(1-y[(many+1):ny]))

}

OPi = function(x,y,op) { # op can be '+', '-', '*', '/', '^', 'pmin', or 'pmax'

nx = length(x)

ny = length(y)

if (op=='-') return(OPi(x, c(-y[(many+1):ny], -y[1:many]),'+'))

if (op=='/') return(OPi(x, c(1/y[(many+1):ny], 1/y[1:many]),'*'))

if ((nx==1) && (ny==1)) return(do.call(op,list(x,y)))

if (nx==1) return(c(do.call(op,list(x,y[1:many])), do.call(op,list(x,y[(many+1):ny]))))

if (ny==1) return(c(do.call(op,list(x[1:many],y)), do.call(op,list(x[(many+1):nx],y))))

c(do.call(op,list(x[1:many],y[1:many])), do.call(op,list(x[(many+1):nx],y[(many+1):ny])))

}

opi = function(x,y,op) { # in case it is not obvious what OPi is doing

nx = length(x)

ny = length(y)

if ((nx==1) && (ny==1)) return(do.call(op,list(x,y)))

if (nx==1) return(opi(rep(x,2*many),y))

if (ny==1) return(opi(x,rep(y,2*many)))

if (op=='+') return(c(x[1:many]+y[1:many], x[(many+1):nx]+y[(many+1):ny]))

if (op=='-') return(opi(x, c(-y[(many+1):ny], -y[1:many]),'+'))

if (op=='*') return(c(x[1:many]*y[1:many], x[(many+1):nx]*y[(many+1):ny]))

if (op=='/') return(opi(x, c(1/y[(many+1):ny], 1/y[1:many]),'*'))

if (op=='^') return(c(x[1:many]^y[1:many], x[(many+1):nx]^y[(many+1):ny]))

if ((op=='min') || (op=='pmin')) return(c(pmin(x[1:many],y[1:many]), pmin(x[(many+1):nx],y[(many+1):ny])))

if ((op=='max') || (op=='pmax')) return(c(pmax(x[1:many],y[1:many]), pmax(x[(many+1):nx],y[(many+1):ny])))

stop('ERROR unknown operator in opi')

}

plotbox <- function(b,new=TRUE,col='blue',lwd=2,xlim=range(b[is.finite(b)]),ylim=c(0,1),xlab='',ylab='Prob',...) {

edf <- function (x, col, lwd, ...) {

n <- length(x)

s <- sort(x)

lines(c(s[[1]],s[[1]]),c(0,1/n),lwd=lwd,col=col,...)

for (i in 2:n) lines(c(s[[i-1]],s[[i]],s[[i]]),c(i-1,i-1,i)/n,col=col,lwd=lwd,...)

}

b = ifelse(b==-Inf, xlim[1] - 10, b)

b = ifelse(b==Inf, xlim[2] + 10, b)

if (new) plot(NULL, xlim=xlim, ylim=ylim, xlab=xlab, ylab=ylab)

if (length(b) < many) edf(b,col,lwd) else

edf(c(min(b),max(b),b[1:min(length(b),many)]),col,lwd)

if (many < length(b)) edf(c(min(b),max(b),b[(many+1):length(b)]),col,lwd)

}

ci <- function(b, c=0.95, alpha=(1-c)/2, beta=1-(1-c)/2) {

left = sort(b[1:many])[round(alpha*many)]

if (many < length(b)) right = sort(b[(many+1):length(b)])[round(beta*many)] else

right = sort(b[1:many])[round(beta*many)]

c(left,right)

}

Get_Q <- function( m_in , c_in , k = ( 0:(m_in*c_in) ) ){

Q_size_GLBL <- function( m ){ 1 + m + m*(m+1)/2 + m*(m+1)*(m+2)*(3*m+1)/24 }

Q_size_LoCL <- function( m , c ){ 1 + c + m*c*(c+1)/2 }

Grb_Q <- function( m_in , c_in , Q_list ){

m = max( m_in , c_in )

c = min( m_in , c_in )

i_min = Q_size_GLBL( m - 1 ) + Q_size_LoCL( m , c-1 ) + 1

Q_list[i_min:(i_min + m*c)]

}

AddingQ <- function( m , Q_list ){

Q_list[ Q_size_GLBL( m - 1 ) + 1 ] = 1

for( c in 1:m ){

i_min = Q_size_GLBL( m - 1 ) + Q_size_LoCL( m , c-1 ) + 1

Q1 = c( Grb_Q( m-1 , c , Q_list ) , rep(0,c) )

Q2 = c( rep(0,m), Grb_Q( m , c-1 , Q_list ) )

Q_list[ i_min:(i_min + m*c) ] = Q1 + Q2

}

Q_list[(Q_size_GLBL( m-1 ) + 1):Q_size_GLBL( m )]

}

Bld_Q <- function( m_top ){

Q_out = rep(0,Q_size_GLBL( m_top ))

Q_out[1] = 1

for( m in 1:m_top )

Q_out[ (Q_size_GLBL( m - 1 ) + 1):(Q_size_GLBL( m )) ] = AddingQ( m , Q_out )

Q_out

}

# body of Get_Q

m = max( m_in , c_in )

c = min( m_in , c_in )

Grb_Q(m, c, Bld_Q(m))[k+1]

}

###############################################################################

# Example calculations

###############################################################################

par(mfrow=c(4,4))

plotem <- function(t,d,p,q,r) {

dp = c(d,p)

if (!is.null(p)) {

plotbox(d,col='gray',xlim=range(dp[is.finite(dp)]))

plotbox(p,new=FALSE)

title(paste('next',t[1],'value'))

}

if (!missing(q)) {

plotbox(q)

title(t[2])

cat('95% confidence interval for the',t[2],'is [')

cat(signif(ci(q),3),sep=', ')

cat(']\n')

}

if (!missing(r)) {

plotbox(r)

title(t[3])

cat('95% confidence interval for the',t[3],'is [')

cat(signif(ci(r),3),sep=', ')

cat(']\n')

}

d = (runif(7) < 0.2) + 0

a = nextvalue.bernoulli(d)

b = parameter.bernoulli(d)

plotem(c('Bernoulli','Bernoulli parameter'),d,a,b)

d = rbinom(5,12,0.6)

a = nextvalue.binomial(d,12)

b = parameter.binomial(d,12)

plotem(c('binomial','binomial parameter'),d,a,b)

d = rpois(400, 12)

a = nextvalue.poisson(d)

b = parameter.poisson(d)

plotem(c('Poisson','Poisson rate'),d,a,b)

d = rexp(7, 10)

a = nextvalue.exponential(d)

b = parameter.exponential(d)

plotem(c('exponential','exponential rate'),d,a,b)

d = rnorm(6,25,5)

a = nextvalue.normal(d)

b = parameter.normal.mu(d)

c = parameter.normal.sigma(d)

plotem(c('normal','normal mean','normal sd'),d,a,b,c)

d = rlnorm(6,5,1)

a = nextvalue.lognormal(d)

b = parameter.lognormal.mu(d)

c = parameter.lognormal.sigma(d)

plotem(c('lognormal','mean of logs','sd of logs'),d,a,b,c)

d = c(2,3,5,8,3,4)

a = nextvalue.nonparametric(d)

plotem('nonparametric',d,a)

d1 = rnorm(5,20,2)

d2 = rnorm(7,16,3)

b = parameter.normal.meandifference(d1,d2)

plotem(c('','difference of means'),NULL,NULL,b)

###############################################################################

# Coverage simulations to check performance

###############################################################################

# Utility to set alpha and beta, possibly symmetrically from a confidence level

alphabeta = function(confidence=0.95,alpha=(1-confidence)/2,beta=1-(1-confidence)/2)

sort(c(alpha, beta))

# Monte Carlo simulation of the coverage performance of the normal mean c-box

coveragenormal.mu = function(n,MU,SIGMA,many=1e4,lots=1e3, ... ) {

ab = alphabeta(...)

mu = seq((MU-5*SIGMA),(MU+5*SIGMA),length.out=many)

coverage = 0

for (i in 1:lots) {

x = rnorm(n, MU, SIGMA)

h = pt(sqrt(n)*(mu-mean(x))/sd(x),n-1) # h = pcnorm.mu(mu, x)

ci = range(mu[(ab[1]<=h) & (h<=ab[2])]) # ci = cinorm.mu(x, alpha=ab[1], beta=ab[2])

if ((ci[1]<=MU)&(MU<=ci[2])) coverage=coverage+1

}

cat('With n=',n,' normal samples at (μ,σ)=(',MU,',',SIGMA,'), ',

diff(ab)*100,'% intervals performed at ', 100*coverage/lots,'%\n',sep='')

coverage/lots

}

# Confidence intervals for a normal distribution’s mean

cinorm.mu = function(x, ...) # same as ci(parameter.normal.mean(x))

mean(x)+qt(alphabeta(...),df=length(x)-1)*sd(x)/sqrt(length(x))

# Confidence box for a normal distribution's mean (distribution as a cumulative function of mu-values)

pcnorm.mu = function(mu, x) pt(sqrt(length(x))*(mu-mean(x))/sd(x),length(x)-1)

# examples

coveragenormal.mu(4, 20, 1)

for (s in c(0.1,1,2,10)) coveragenormal.mu(4, 20, s)

for (s in c(0.1,1,2,10)) coveragenormal.mu(n=4, MU=20, SIGMA=s, alpha=0.2, beta=0.8)

# Confidence box for a normal distribution's mean (distribution as a collection of deviates)

rcnorm.mu = function(many, z) # same as parameter.normal.mu(z)

mean(z)+sd(z)*rt(many, length(z)-1)/sqrt(length(z))

# Confidence intervals for the difference of normal means

cinormdiff.mu <- function(x, y, many=2000, ...) {

d = sort(rcnorm.mu(many, x) - rcnorm.mu(many, y))

range(d[round(alphabeta(...)*many)])

}

# Monte Carlo simulation of the coverage performance of the c-box for the difference of normal means

coveragenormal.mudiff=function(n,MU,SIGMA,many=1e4,lots=1e3,...){

ab = alphabeta(...)

truediff = MU[1] - MU[2]

coverage = 0

for (i in 1:lots) {

x = rnorm(n[1], MU[1], SIGMA[1])

y = rnorm(n[2], MU[2], SIGMA[2])

ci=range(sort(rcnorm.mu(many,x)-

rcnorm.mu(many,y))[round(many*ab)])

if ((ci[1] <= truediff) & (truediff <= ci[2])) coverage = coverage + 1

}

cat(' Intended',diff(ab)*100,'%\n','Observed',100*coverage/lots,'%\n')

coverage/lots

}

# examples

plotbox(rcnorm.mu(1000,rnorm(4,5,1)))

plotbox(rcnorm.mu(1000,rnorm(1000,5,1)))

coveragenormal.mudiff(n=c(5,6), MU=c(10,14), SIGMA=c(1,2), confidence=0.95)

# C-box for a constant Y masked by error e of unknown distribution and bias, given independent random samples of Y and e

nonparametric.debiasing <- function(x, error) {# i.e., the c-box for Y such that x[i] = Y + e[i]

z = NULL

for(jj in 1:length(error)) z = c(z, y - error[jj])

z = sort(z)

Q = Get_Q(length(y), length(error))

w = Q / sum( Q )

env(mixture(z,w), mixture(c(z[-1],Inf),w))

}

# Monte Carlo simulation of the coverage performance of the debiased constant c-box

coveragenonparametric.debias <- function(truth=0, err=error, few=4, some=9, many=1000, ...) {

ab = alphabeta(...)

cc = NULL

for (n in 1:many) {

e = err(some+few) # error structure

y = truth + e[(few+1):(some+few)]

e = e[1:few]

z = NULL

for(jj in 1:few) z = c(z,y - e[jj])

z = sort(z)

Q = Get_Q(some, few)

c = cumsum(Q / sum(Q))

j = length(c[c<ab[1]])+1

k = length(c[c<=ab[2]])

if ((c(-Inf , z)[j] <= truth) & (truth <= c(z , Inf)[k])) col=3 else col=4

cc = c(cc,col)

}

length(cc[cc==3])/length(cc) # coverage performance

}

#examples

coveragenonparametric.debias(truth=5, function(many) return(rnorm(many)) ) # unbiased

coveragenonparametric.debias(truth=5, function(many) return(rnorm(many,10)) ) # biased

error <- function(many) return(rbeta(many, 2.0, 8.0) * 4 + 3 ) # this error structure could be anything

coveragenonparametric.debias()

coveragenonparametric.debias(alpha=0.9, beta=0.1)

coveragenonparametric.debias(alpha=0.9, beta=0.0)

Page updated

Google Sites

Report abuse