chapter5.jl
# Chapter 5
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 5.1
# 5.2
# 5.2.1
# 5.2.2
rng = MersenneTwister(123457689)
N = 1000
υ = randn!(rng, zeros(N)).*3 .+ 1
p = 2
mean(υ .- p .> 0)
1 - cdf(Normal(1, 3), p)
# 5.2.3
p = rand!(rng, zeros(9)).*20 .- 10
s = Vector{Union{Missing, Float64}}(missing, 9)
for i = 1:9
s[i] = mean(υ .- p[i] .> 0)
end
a = ecdf(υ)
scatter(p, 1 .- s)
scatter!(ecdf(υ))
# note the R code has survival function rather than the cdf.
# 5.3.1
# 5.3.2
N = 1000
a = fill(2, N)
b = -3
υ = randn!(rng, zeros(N))
x = rand!(rng, zeros(N))
y_star = a + b.*x + υ
y = y_star .> 0
data1 = DataFrame(y=y, x=x)
lm1 = lm(@formula(y ~ x), data1)
scatter(x, y)
scatter!(x, coef(lm1)[1] .+ coef(lm1)[2].*x)
scatter!(x, a + b.*x)
# 5.3.3
# 5.4
# 5.4.1
N = 100
p = 0.367
Head = rand!(rng, zeros(N)) .< p
mean(Head)
factorial(5)/(factorial(3)*factorial(2))
factorial(big(100))/(factorial(big(34))*factorial(big(66)))*(0.5.^100)
# 5.4.2
binom = function(p, N, p̂)
return (p̂*N*log(p) + (1 - p̂)*N*log(1 - p))
end
binom_wrap = function(p)
return -binom(p, N, mean(Head))
end
a = optimize(binom_wrap, 0, 1)
a.minimum
a.minimizer
p = 1:1000
log_lik = binom.(p./1000, N, mean(Head))
scatter(p./1000, log_lik)
vline!([mean(Head), 0.367])
# 5.4.3
# 5.4.4
# note that the codes is quite a bit different from the R code.
# standard normal density
ϕ = function(z)
(exp.(-z.^2))./√π
end
# log standard normal density
logϕ = function(z)
-z.^2 - √π
end
ols_ml = function(par, y, X)
σ = exp(par[1])
β = par[2:3]
z = (y - X*β)./σ
return -sum(logϕ.(z) .- log(σ))
end
ols_ml_wrap = function(par)
return ols_ml(par, y, X)
end
global y = y_star
global X = hcat(ones(length(y)), data1.x)
init = [0.0; 2.0; -3.0]
a = optimize(ols_ml_wrap, init)
exp(a.minimizer[1])
a.minimizer[2:3]
# 5.4.5
# 5.4.6
Φ = function(v)
cdf(Normal(), v)
end
logΦ = function(v)
logcdf(Normal(), v)
end
probit = function(β, y, X)
return (1 .- y).*logΦ.(-X*β) + y.*logΦ.(X*β)
end
probit_wrap = function(β)
return -sum(probit(β, y, X))
end
global y = data1.y
global X = hcat(ones(length(y)), data1.x)
a = optimize(probit_wrap, [2.0, -3.0])
a.minimizer
# 5.4.7
glm1 = glm(@formula(y ~ x), data1, Binomial(), ProbitLink())
# 5.5.
# 5.5.1
# 5.5.2
# 5.5.3
Random.seed!(123456789)
N = 5000
X_A = hcat(ones(N), rand(rng, N), rand(rng, N))
X_B = hcat(ones(N), rand(rng, N), rand(rng, N))
β = [1; -2; 3]
# Probit
υ_A = randn(rng, N)
y = (X_A .- X_B)*β + υ_A .> 0
data1 = DataFrame(y = y, x₁=X_A[1:end,2] .- X_B[1:end,2], x₂=X_A[1:end,3] .- X_B[1:end,3])
glm1 = glm(@formula(y ~ x₁ + x₂), data1, Binomial(), ProbitLink())
# Logit
υ_A = rand.(Gumbel(), N)
υ_B = rand.(Gumbel(), N)
y = (X_A .- X_B)*β .+ υ_A .- υ_B .> 0
data2 = DataFrame(y = y, x₁=X_A[1:end,2] .- X_B[1:end,2], x₂=X_A[1:end,3] .- X_B[1:end,3])
glm2 = glm(@formula(y ~ x₁ + x₂), data2, Binomial(), LogitLink())
# 5.6
# 5.6.1
# 5.6.2
# 5.6.3
biprobit = function(β, ρ, y, W₁, W₂, K)
rng = MersenneTwister(123457689)
ϵ = 1e-10
N = size(y)[1]
υ = randn!(rng, zeros(K))
p_00 = zeros(N)
p_10 = zeros(N)
υ_k0 = zeros(N)
for k = 1:K
υ_k0 = υ[k] .< -W₁*β
p_00 = p_00 .+ υ_k0.*Φ.((-W₂*β .- ρ*υ[k])./√(1 - ρ^2))
p_10 = p_10 .+ (1 .- υ_k0).*Φ.(((W₁ - W₂)*β .+ (1 - ρ)*υ[k])./√(1 - ρ^2))
end
loglik = (y[1:end,1] .== 0).*(y[1:end,2] .== 0).*log.(p_00./K .+ ϵ) + (y[1:end,1] .== 1).*(y[1:end,2] .== 0).*log.(p_10./K .+ ϵ) + (y[1:end,1] .== 0).*(y[1:end,2] .== 1).*log.(1 .- p_00./K .- p_10./K .+ ϵ)
end
biprobit_wrap = function(par)
ρ = exp(par[1])/(1 + exp(par[1]))
β = par[2:end]
-sum(biprobit(β, ρ, y, W₁, W₂, K))
end
# 5.6.4
# 5.6.5
f_logit = function(β, y, W₁, W₂)
ϵ = 1e-20
p_10 = exp.(W₁*β)./(1 .+ exp.(W₁*β) .+ exp.(W₂*β))
p_01 = exp.(W₂*β)./(1 .+ exp.(W₁*β) .+ exp.(W₂*β))
p_00 = 1 .- p_10 .- p_01
loglik = (y[1:end,1] .== 0).*(y[1:end,2] .== 0).*log.(p_00 .+ ϵ) + (y[1:end,1] .== 1).*(y[1:end,2] .== 0).*log.(p_10 .+ ϵ) + (y[1:end,1] .== 0).*(y[1:end,2] .== 1).*log.(p_01 .+ ϵ)
end
f_logit_wrap = function(β)
-sum(f_logit(β, y, W₁, W₂))
end
# 5.6.6
Random.seed!(123456789)
N = 1000
μ = [0; 0]
ρ = 0.1
Σ = hcat([1; ρ], [ρ; 1])
Υ = rand(MvNormal(μ, Σ), N)'
X₁ = hcat(rand!(rng, zeros(N)), rand!(rng, zeros(N)))
X₂ = hcat(rand!(rng, zeros(N)), rand!(rng, zeros(N)))
a = fill(-1, N, 2)
b = -3
c = 4
U = a + b.*X₁ + c.*X₂ + Υ
global y = hcat((U[1:end,1] .> 0).*(U[1:end,1] .> U[1:end,2]), (U[1:end,2] .> 0).*(U[1:end,2] .> U[1:end,1]))
init = [log(0.1); -1; -3; 4]
global W₁ = hcat(ones(N), X₁[1:end,1], X₂[1:end,1])
global W₂ = hcat(ones(N), X₁[1:end,2], X₂[1:end,2])
global K = 100
a₁ = optimize(biprobit_wrap, init)
a₁.minimizer[2:end]
exp(a₁.minimizer[1])/(1 + exp(a₁.minimizer[1]))
b₁ = optimize(f_logit_wrap, init[2:end])
b₁.minimizer
# 5.7
# 5.7.1
path = "ENTER PATH HERE"
x = DataFrame(CSV.read(string(path,"/hhpub.csv")))
choice = Vector{Union{Missing, String}}(missing, 129696)
home = Vector{Union{Missing, Float64}}(missing, 129696)
income = Vector{Union{Missing, Float64}}(missing, 129696)
density = Vector{Union{Missing, Float64}}(missing, 129696)
index = []
for i = 1:129696
if x.CAR[i] == 1
choice[i] = "car"
end
if x.BUS[i] == 1
choice[i] = "bus"
end
if x.TRAIN[i] == 1
choice[i] = "train"
end
if x.HOMEOWN[i] == 1
home[i] = 1
elseif x.HOMEOWN[i] > 1
home[i] = 0
end
if x.HHFAMINC[i] > 0
income[i] = x.HHFAMINC[i]
end
if x.HTPPOPDN[i] > 0
density[i] = x.HTPPOPDN[i]/1000
end
if x.WRKCOUNT[i] > 0 && (x.MSACAT[i] == 1 || x.MSACAT[i] == 2)
index = [index; i]
end
#println(i)
end
car1 = choice .== "car"
train1 = choice .== "train"
urban1 = x.URBAN .== 1
rail = x.RAIL .== 1
x1 = DataFrame(car1=car1[index], train1=train1[index], home=home[index], HHSIZE=x.HHSIZE[index], income=income[index], urban1=urban1[index], density=density[index], MSACAT=x.MSACAT[index], rail=rail[index])
x1 = dropmissing(x1)
# Note in the R code this is y, which leads to confusion below.
vars = ["car1", "train1", "home", "HHSIZE", "income", "urban1", "density"]
summ_tab = Array{Union{Missing,Float64,String}}(missing,length(vars)+1,3)
summ_tab[1,2:end] = ["Rail" "No Rail"]
summ_tab[2:end,1] = vars
for i in 1:length(vars)
summ_tab[i+1,2] = mean(x1[1:end,i][x1.rail .== 1])
summ_tab[i+1,3] = mean(x1[1:end,i][x1.rail .== 0])
end
summ_tab
# 5.7.2
# without rail
x1_nr = x1[x1.rail .== 0,1:end]
glm_nr = glm(@formula(car1 ~ home + HHSIZE + income + urban1 + density), x1_nr, Binomial())
# with rail
x1_r = x1[x1.rail .== 1,1:end]
glm_r = glm(@formula(car1 ~ home + HHSIZE + income + urban1 + density), x1_r, Binomial())
# 5.7.3
Nᵣ = size(x1_r)[1]
global W₁ = hcat(ones(Nᵣ), x1_r.home, x1_r.HHSIZE, x1_r.income, x1_r.urban1, zeros(Nᵣ))
global W₂ = hcat(ones(Nᵣ), zeros(Nᵣ), zeros(Nᵣ), zeros(Nᵣ), zeros(Nᵣ), x1_r.density)
global y = hcat(x1_r.car1,x1_r.train1)
global K = 200
init = coef(glm_r)
a₂ = optimize(f_logit_wrap, init)
a₂.minimizer
init = [0; a₂.minimizer]
a₁ = optimize(biprobit_wrap, init, show_trace=true, iterations=5000)
a₁.minimizer[2:end]
exp(a₁.minimizer[1])/(1 + exp(a₁.minimizer[1]))
# 5.7.4
N_nr = size(x1_nr)[1]
W₁ = hcat(ones(N_nr), x1_nr.home, x1_nr.HHSIZE, x1_nr.income, x1_nr.urban1, zeros(N_nr))
W₂ = hcat(ones(N_nr), zeros(N_nr), zeros(N_nr), zeros(N_nr), zeros(N_nr), x1_nr.density)
# Probit Estimate
rng = MersenneTwister(123457689)
ρ̂ = exp(a₁.minimizer[1])/(1 + exp(a₁.minimizer[1]))
β̂ = a₁.minimizer[2:end]
K = 200
υ = randn!(rng, zeros(K))
p_00 = zeros(N_nr)
p_10 = zeros(N_nr)
for k = 1:K
υ_k0 = υ[k] .< -W₁*β̂
p_00 = p_00 .+ υ_k0.*Φ.((-W₂*β̂ .- ρ̂*υ[k])./√(1 - ρ̂^2))
p_10 = p_10 .+ (1 .- υ_k0).*Φ.(((W₁ - W₂)*β̂ .+ (1 - ρ̂)*υ[k])./√(1 - ρ̂^2))
end
p_train_pr = (K .- p_00 .- p_10)./K
mean(p_train_pr)
# Logit Estimate
β̂ = a₂.minimizer
p_train_lg = exp.(W₂*β̂)./(1 .+ exp.(W₁*β̂) + exp.(W₂*β̂))
mean(p_train_lg)
# 5.8