chapter6.jl
# Chapter 6
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 6.1
# 6.2
# 6.2.1
# 6.2.2
rng = MersenneTwister(123457689)
N = 500
a = fill(2, N)
b = -3
x = rand!(rng, zeros(N))
υ = randn!(rng, zeros(N))
y_star = a + b.*x + υ
y = Vector{Union{Missing, Float64}}(missing,N)
for i = 1:N
if y_star[i] > 0
y[i] = y_star[i]
else
y[i] = 0
end
end
data1 = DataFrame(y = y, x = x)
lm1 = lm(@formula(y ~ x), data1)
coef(lm1)[2]
scatter(x, y)
scatter!(x, a + b.*x)
scatter!(x, coef(lm1)[1] .+ coef(lm1)[2].*x)
coef(lm(@formula(y ~ x), data1[data1.x .< 0.6,1:end]))[2]
length(data1.y[data1.x .< 0.6])
data2 = DataFrame(y = y .> 0, x = x)
coef(glm(@formula(y ~ x), data2, Binomial(), ProbitLink()))
# 6.2.3
histogram(y)
# 6.2.4
# 6.2.5
logϕ = function(z)
-z.^2 - √π
end
logΦ = function(v)
logcdf(Normal(), v)
end
Tobit = function(β, σ, y, X)
is0 = y .== 0
z = (y - X*β)./σ
return is0.*logΦ.(z) + (1 .- is0).*(logϕ.(z) .- log(σ))
end
Tobit_wrap = function(par)
σ = exp(par[1])
β = par[2:end]
return -sum(Tobit(β, σ, y, X))
end
global y = data1.y
global X = hcat(ones(N), data1.x)
init = [0; coef(lm1)]
a1 = optimize(Tobit_wrap, init)
exp(a1.minimizer[1])
a1.minimizer[2:end]
# 6.3
# 6.3.1
path = "/Volumes/My WD/Book"
x = DataFrame(CSV.read(string(path,"/NLSY97_min.csv")))
wage = Vector{Union{Missing,Float64}}(missing, 8984)
for i = 1:8984
if x[i,9] > 0 && x[i,8] > 0
wage[i] = x[i,8]/x[i,9]
end
end
# topcode wage
for i = 1:8984
if !ismissing(wage[i]) && wage[i] > quantile(skipmissing(wage), 0.9)
wage[i] = missing
end
end
lwage = Vector{Union{Missing,Float64}}(missing, 8984)
ed = Vector{Union{Missing,Float64}}(missing, 8984)
exper = Vector{Union{Missing,Float64}}(missing, 8984)
exper2 = Vector{Union{Missing,Float64}}(missing, 8984)
for i = 1:8984
if !ismissing(wage[i])
if wage[i] > 7.25
lwage[i] = log(wage[i])
else
lwage[i] = 0
end
if x.CV_HGC_EVER_EDT[i] > 0 && x.CV_HGC_EVER_EDT[i] < 25
ed[i] = x.CV_HGC_EVER_EDT[i]
end
exper[i] = 2010 - x[i,4] - ed[i] - 6
exper2[i] = (exper[i]^2)/100
end
println(i)
end
female = x.KEY!SEX .== 2
black = x.KEY!RACE_ETHNICITY .== 1
x1 = DataFrame(lwage=lwage, wage=wage, ed=ed, exper=exper, exper2=exper2,female=female,black=black, lwage_bin=lwage_bin)
x1 = dropmissing(x1)
x1.lwage_bin = x1.lwage .> 0
histogram(x1.wage[x1.wage .< 30])
vline!(fill(7.25,3972))
xlabel!("wage rate")
# 6.3.2
lm1 = lm(@formula(lwage ~ ed + exper + exper2 + female + black), x1)
glm1 = glm(@formula(lwage_bin ~ ed + exper + exper2 + female + black), x1, Binomial(), ProbitLink())
global y = x1.lwage
global X = hcat(ones(length(y)), x1.ed, x1.exper, x1.exper2, x1.female, x1.black)
init = [0; coef(glm1)]
a = optimize(Tobit_wrap, init)
exp(a.minimizer[1])
a.minimizer[2:end]
# 6.4
# 6.4.1
# 6.4.2
Random.seed!(123456789)
rng = MersenneTwister(123456789)
N = 100
a = fill(6, N)
b = -3
c = fill(4, N)
d = -5
x = rand!(rng, zeros(N))
z = rand!(rng, zeros(N))
μ = [0; 0]
ρ = 0.5
Σ = hcat([1; ρ], [ρ; 1])
Υ = rand(MvNormal(μ, Σ), N)'
y = Vector{Union{Missing, Float64}}(missing, N)
for i = 1:N
if c[i] + d*z[i] + Υ[i,1] > 0
y[i] = a[i] + b*x[i] + Υ[i,2]
else
y[i] = 0
end
end
# OLS Model
data1 = DataFrame(y = y, x = x)
lm1 = lm(@formula(y ~ x), data1)
# OLS Model with subset of data1
lm2 = lm(@formula(y ~ x), data1[z .< 0.6,1:end])
# Probit
data1.y1 = y .> 0
data1.z = z
glm1 = glm(@formula(y1 ~ z), data1, Binomial(), ProbitLink())
# 6.4.3
# 6.4.4
# 6.4.5
Heckman = function(β, γ, ρ, y, X, Z)
is0 = y .== 0
Zγ_adj = (Z*γ + ρ.*(y - X*β))./(√(1 - ρ^2))
return is0.*logΦ.(-Z*γ) + (1 .- is0).*(logΦ.(Zγ_adj) + logϕ.(y - X*β))
end
Heckman_wrap = function(par)
ρ = exp(par[1])/(1 + exp(par[2]))
J = size(X)[2]
β = par[2:(J+1)]
γ = par[(J+2):end]
return -sum(Heckman(β, γ, ρ, y, X, Z))
end
global y = data1.y
global X = hcat(ones(length(y)), data1.x)
global Z = hcat(ones(length(y)), data1.z)
init = [0; coef(lm1); coef(glm1)]
a1 = optimize(Heckman_wrap, init)
exp(a1.minimizer[1])/(1 + exp(a1.minimizer[1]))
a1.minimizer[2:3]
a1.minimizer[4:5]
# 6.5
# 6.5.1
path = "/Volumes/My WD/Book"
x = DataFrame(CSV.read(string(path,"/NLSY97_gender_book.csv")))
wage = Vector{Union{Missing,Float64}}(missing, 8984)
fulltime = Vector{Union{Missing,Float64}}(missing, 8984)
for i = 1:8984
if x[i,15] == "NA" || x[i,14] == "NA"
wage[i] = missing
elseif parse(Float64,x[i,15]) > 0
wage[i] = parse(Float64,x[i,14])/parse(Float64,x[i,15])
end
if x[i,15] == "NA"
fulltime[i] = missing
else
if parse(Float64,x[i,15]) > 1750
fulltime[i] = 1
else
fulltime[i] = 0
end
end
#println(i)
end
lwage = Vector{Union{Missing,Float64}}(missing, 8984)
lftwage = Vector{Union{Missing,Float64}}(missing, 8984)
age = Vector{Union{Missing,Float64}}(missing, 8984)
age2 = Vector{Union{Missing,Float64}}(missing, 8984)
college = Vector{Union{Missing,Float64}}(missing, 8984)
msa = Vector{Union{Missing,Float64}}(missing, 8984)
children = Vector{Union{Missing,Float64}}(missing, 8984)
for i = 1:8984
if !ismissing(wage[i])
if wage[i] > 1
lwage[i] = log(wage[i])
else
lwage[i] = 0
end
if lwage[i] > 0 && fulltime[i] == 1
lftwage[i] = lwage[i]
else
lftwage[i] = 0
end
if x.CV_HIGHEST_DEGREE_0708_2007[i] != "NA"
if parse(Int64, x.CV_HIGHEST_DEGREE_0708_2007[i]) > 2
college[i] = 1
else
college[i] = 0
end
end
msa[i] = x.CV_MSA_2007[i] ∈ ["2", "3", "4"]
if x.CV_BIO_CHILD_HH_2007[i] != "NA"
if parse(Int64,x.CV_BIO_CHILD_HH_2007[i]) > 0
children[i] = 1
else
children[i] = 0
end
end
end
println(i)
end
female = x.KEY_SEX_1997 .== 2
black = x.KEY_RACE_ETHNICITY_1997 .== 1
south = x.CV_CENSUS_REGION_2007 .== "3"
urban = x[1:end,13] .== "1"
married = x.CV_MARSTAT_COLLAPSED_2007 .== "2"
age = 2007 .- x.KEY_BDATE_Y_1997
age2 = age.^2
x1 = DataFrame(lftwage=lftwage,female=female, black=black, age=age, age2=age2, msa=msa, urban=urban, south=south, college=college, married=married, children=children)
x1 = dropmissing(x1)
x1.lftwage_bin = x1.lftwage .> 0
x1_f = x1[x1.female .== 1,1:end]
x1_m = x1[x1.female .== 0,1:end]
# Note that these samples are smaller than for the R code.
# not clear why there is a difference.
lm1 = lm(@formula(lftwage ~ age + age2 + black + college + south + msa), x1_m)
lm2 = lm(@formula(lftwage ~ age + age2 + black + college + south + msa), x1_f)
lm3 = lm(@formula(lftwage ~ female + age + age2 + black + college + south + msa), x1)
plot(kde(x1_m.lftwage[x1_m.lftwage_bin]))
plot!(kde(x1_f.lftwage[x1_f.lftwage_bin]))
# 6.5.2
glm1 = glm(@formula(lftwage_bin ~ college), x1_f, Binomial(), ProbitLink())
glm2 = glm(@formula(lftwage_bin ~ college + south), x1_f, Binomial(), ProbitLink())
glm3 = glm(@formula(lftwage_bin ~ college + south + married + children), x1_f, Binomial(), ProbitLink())
# 6.5.3
x1.lftwage_norm = (x1.lftwage .- mean(x1.lftwage))./std(x1.lftwage)
x1.lftwage_norm3 = x1.lftwage_norm
x1.lftwage_norm3[x1.lftwage .== 0] = zeros(sum(x1.lftwage .== 0))
global y = x1.lftwage_norm3[x1.female]
global X = hcat(ones(length(y)), x1.age[x1.female], x1.age2[x1.female], x1.black[x1.female], x1.college[x1.female], x1.south[x1.female], x1.msa[x1.female])
global Z = hcat(ones(length(y)), x1.college[x1.female], x1.south[x1.female], x1.married[x1.female], x1.children[x1.female])
init = [0; coef(lm2); coef(glm3)]
a = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)
β̂ = a.minimizer[2:8]
yₐ = X*β̂ + randn!(rng, zeros(length(y)))
global y = x1.lftwage_norm3[x1.female .== 0]
global X = hcat(ones(length(y)), x1.age[x1.female .== 0], x1.age2[x1.female .== 0], x1.black[x1.female .== 0], x1.college[x1.female .== 0], x1.south[x1.female .== 0], x1.msa[x1.female .== 0])
y_am = X*β̂ + randn!(rng, zeros(length(y)))
plot(kde(yₐ))
plot!(kde(y_am))
plot!(kde(x1.lftwage_norm[(x1.female).*(x1.lftwage .> 0)]))
plot!(kde(x1.lftwage_norm[(x1.female .== 0).*(x1.lftwage .> 0)]))
# 6.6.1
path = "/Volumes/My WD/Book"
x = CSV.read(string(path,"/nls.csv"))
lwage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.lwage76))
wage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.wage76))
for i in 1:length(lwage76_temp)
if length(x.lwage76[i]) > 1
lwage76_temp[i] = parse(Float64, x.lwage76[i])
wage76_temp[i] = parse(Float64, x.wage76[i])
end
end
x.lwage76 = lwage76_temp
x.wage76 = wage76_temp
x1 = DataFrame(x)
x1 = dropmissing(x1)
x1.lwage76_norm = (x1.lwage76 .- mean(x1.lwage76))./std(x1.lwage76)
x1.exp1 = x1.age76 - x1.ed76 - fill(6,3010)
x1.exp2 = (x1.exp1.^2)./100
x1.college = x1.ed76 .> 12
# 6.6.2
lm_c = lm(@formula(lwage76_norm ~ exp1 + exp2 + black + reg76r + smsa76r), x1[x1.college,1:end])
lm_nc = lm(@formula(lwage76_norm ~ exp1 + exp2 + black + reg76r + smsa76r), x1[x1.college .== 0,1:end])
# 6.6.3
# Probit Estimate
glm1 = glm(@formula(college ~ nearc4), x1, Binomial(), ProbitLink())
glm2 = glm(@formula(college ~ nearc4 + momdad14), x1, Binomial(), ProbitLink())
glm3 = glm(@formula(college ~ nearc4 + momdad14 + black + smsa66r), x1, Binomial(), ProbitLink())
glm4 = glm(@formula(college ~ nearc4 + momdad14 + black + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669), x1, Binomial(), ProbitLink())
# 6.6.4
# College
global y = x1.lwage76_norm[x1.college]
global X = hcat(ones(length(y)), x1.exp1[x1.college], x1.exp2[x1.college], x1.black[x1.college], x1.reg76r[x1.college], x1.smsa76r[x1.college])
global Z = hcat(ones(length(y)), x1.nearc4[x1.college], x1.momdad14[x1.college], x1.black[x1.college], x1.smsa66r[x1.college], x1.reg662[x1.college], x1.reg663[x1.college], x1.reg664[x1.college], x1.reg665[x1.college], x1.reg666[x1.college], x1.reg667[x1.college], x1.reg668[x1.college], x1.reg669[x1.college])
init = [0; coef(lm_c); coef(glm4)]
a_c = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)
# No college
global y = -x1.lwage76_norm[x1.college .== 0]
global X = hcat(ones(length(y)), x1.exp1[x1.college .== 0], x1.exp2[x1.college .== 0], x1.black[x1.college .== 0], x1.reg76r[x1.college .== 0], x1.smsa76r[x1.college .== 0])
global Z = hcat(ones(length(y)), x1.nearc4[x1.college .== 0], x1.momdad14[x1.college .== 0], x1.black[x1.college .== 0], x1.smsa66r[x1.college .== 0], x1.reg662[x1.college .== 0], x1.reg663[x1.college .== 0], x1.reg664[x1.college .== 0], x1.reg665[x1.college .== 0], x1.reg666[x1.college .== 0], x1.reg667[x1.college .== 0], x1.reg668[x1.college .== 0], x1.reg669[x1.college .== 0])
init = [0; coef(lm_nc); -coef(glm4)]
a_nc = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)
# Predicted wages
μ = [0; 0]
ρ̂ = exp(a_c.minimizer[1])/(1 + exp(a_c.minimizer[1]))
Σ̂ = [[1; ρ̂], [ρ̂; 1]]
Υ = rand(MvNormal(μ, Σ̂), size(x1)[1])'
β̂_c = a_c.minimizer[2:(length(coef(lm_c))+1)]
β̂_nc = a_nc.minimizer[2:(length(coef(lm_nc))+1)]
global X = hcat(ones(size(x1)[1]), x1.exp1, x1.exp2, x1.black, x1.reg76r, x1.smsa76r)
x1.coll_y = X*β̂_c + Υ[1:end,1]
x1.nocoll_y = -X*β̂_nc + Υ[1:end,2]
scatter(x1.nocoll_y, x1.coll_y)
scatter!(x1.nocoll_y, x1.nocoll_y)
xlabel!("wage (no-college)")
ylabel!("wage (college)")
# 6.6.5
mean(x1.coll_y - x1.nocoll_y)/4