# chapter6.jl

# Chapter 6


#import Pkg

#Pkg.add("Plots")

#Pkg.add("Optim")

#Pkg.add("StatsKit")

#Pkg.add("GLM")

#Pkg.add("DataFrames")

#Pkg.add("StatsPlots")

#Pkg.add("StatFiles")

#Pkg.add("NLSolversBase")

#Pkg.add("Distributions")

using Random

using Statistics

using Plots

using Optim

using NLSolversBase

using StatsKit

using CSV

using GLM

using DataFrames

using LinearAlgebra

using StatsPlots

using StatFiles

using Distributions


# 6.1


# 6.2

# 6.2.1

# 6.2.2


rng = MersenneTwister(123457689)

N = 500

a = fill(2, N)

b = -3

x = rand!(rng, zeros(N))

υ = randn!(rng, zeros(N))

y_star = a + b.*x + υ

y = Vector{Union{Missing, Float64}}(missing,N)

for i = 1:N

if y_star[i] > 0

y[i] = y_star[i]

else

y[i] = 0

end

end
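# y is the latent y_star censored from below at zero: draws with y_star ≤ 0 are
# recorded as 0 rather than dropped, which is the Tobit setup.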

data1 = DataFrame(y = y, x = x)


lm1 = lm(@formula(y ~ x), data1)

coef(lm1)[2]
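# A quick check (a minimal sketch; exact numbers depend on the RNG draw): censoring
# at zero pulls the OLS slope toward zero relative to the true slope b = -3.
println("true slope: ", b, "   OLS slope: ", round(coef(lm1)[2], digits = 2))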


scatter(x, y)

scatter!(x, a + b.*x)

scatter!(x, coef(lm1)[1] .+ coef(lm1)[2].*x)


coef(lm(@formula(y ~ x), data1[data1.x .< 0.6,1:end]))[2]

length(data1.y[data1.x .< 0.6])
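# With b = -3, censoring bites mainly at large x, so restricting to x < 0.6 keeps
# mostly uncensored observations and the OLS slope moves closer to the truth.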


data2 = DataFrame(y = y .> 0, x = x)

coef(glm(@formula(y ~ x), data2, Binomial(), ProbitLink()))


# 6.2.3


histogram(y)


# 6.2.4

# 6.2.5


# log of the standard normal density: log ϕ(z) = -z²/2 - log(2π)/2

logϕ = function(z)

-z.^2 ./2 .- log(2π)/2

end


logΦ = function(v)

logcdf(Normal(), v)

end


Tobit = function(β, σ, y, X)

is0 = y .== 0

z = (y - X*β)./σ

return is0.*logΦ.(z) + (1 .- is0).*(logϕ.(z) .- log(σ))

end
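# The function above returns the observation-level Tobit log-likelihood,
#   log L_i = 1{y_i = 0} log Φ(-x_i'β/σ) + 1{y_i > 0} [log ϕ((y_i - x_i'β)/σ) - log σ].
# For censored rows y_i = 0, so z_i = (0 - x_i'β)/σ = -x_i'β/σ, which is why the same
# z vector serves both the censored and the uncensored pieces.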


Tobit_wrap = function(par)

σ = exp(par[1])

β = par[2:end]

return -sum(Tobit(β, σ, y, X))

end
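# Tobit_wrap stacks the parameters as [log σ; β] and returns the negative
# log-likelihood for Optim's minimizer; writing σ = exp(par[1]) keeps the standard
# deviation positive without an explicit constraint.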


global y = data1.y

global X = hcat(ones(N), data1.x)

init = [0; coef(lm1)]

a1 = optimize(Tobit_wrap, init)

exp(a1.minimizer[1])

a1.minimizer[2:end]
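# A quick sanity check (a minimal sketch; exact values depend on the draw): the Tobit
# MLE should be close to the simulated truth a = 2, b = -3, σ = 1, unlike the
# attenuated OLS slope above.
println("Tobit σ̂ = ", round(exp(a1.minimizer[1]), digits = 2), ",  (â, b̂) = ", round.(a1.minimizer[2:end], digits = 2))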



# 6.3

# 6.3.1


path = "/Volumes/My WD/Book"

x = DataFrame(CSV.read(string(path,"/NLSY97_min.csv")))

wage = Vector{Union{Missing,Float64}}(missing, 8984)

for i = 1:8984

if x[i,9] > 0 && x[i,8] > 0

wage[i] = x[i,8]/x[i,9]

end

end

# drop top-coded wages (above the 90th percentile); compute the cutoff once so the
# threshold does not shift as observations are set to missing

wage_cut = quantile(skipmissing(wage), 0.9)

for i = 1:8984

if !ismissing(wage[i]) && wage[i] > wage_cut

wage[i] = missing

end

end

lwage = Vector{Union{Missing,Float64}}(missing, 8984)

ed = Vector{Union{Missing,Float64}}(missing, 8984)

exper = Vector{Union{Missing,Float64}}(missing, 8984)

exper2 = Vector{Union{Missing,Float64}}(missing, 8984)

for i = 1:8984

if !ismissing(wage[i])

if wage[i] > 7.25

lwage[i] = log(wage[i])

else

lwage[i] = 0

end

if x.CV_HGC_EVER_EDT[i] > 0 && x.CV_HGC_EVER_EDT[i] < 25

ed[i] = x.CV_HGC_EVER_EDT[i]

end

exper[i] = 2010 - x[i,4] - ed[i] - 6

exper2[i] = (exper[i]^2)/100

end

println(i)

end

female = x.KEY!SEX .== 2

black = x.KEY!RACE_ETHNICITY .== 1

x1 = DataFrame(lwage=lwage, wage=wage, ed=ed, exper=exper, exper2=exper2, female=female, black=black)

x1 = dropmissing(x1)

x1.lwage_bin = x1.lwage .> 0


histogram(x1.wage[x1.wage .< 30])

vline!([7.25])  # federal minimum wage of $7.25

xlabel!("wage rate")


# 6.3.2

lm1 = lm(@formula(lwage ~ ed + exper + exper2 + female + black), x1)

glm1 = glm(@formula(lwage_bin ~ ed + exper + exper2 + female + black), x1, Binomial(), ProbitLink())


global y = x1.lwage

global X = hcat(ones(length(y)), x1.ed, x1.exper, x1.exper2, x1.female, x1.black)

init = [0; coef(glm1)]

a = optimize(Tobit_wrap, init)

exp(a.minimizer[1])

a.minimizer[2:end]



# 6.4

# 6.4.1

# 6.4.2

Random.seed!(123456789)

rng = MersenneTwister(123456789)

N = 100

a = fill(6, N)

b = -3

c = fill(4, N)

d = -5

x = rand!(rng, zeros(N))

z = rand!(rng, zeros(N))

μ = [0; 0]

ρ = 0.5

Σ = hcat([1; ρ], [ρ; 1])

Υ = rand(MvNormal(μ, Σ), N)'

y = Vector{Union{Missing, Float64}}(missing, N)

for i = 1:N

if c[i] + d*z[i] + Υ[i,1] > 0

y[i] = a[i] + b*x[i] + Υ[i,2]

else

y[i] = 0

end

end
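# Unlike the Tobit simulation, y is observed only when the latent selection index
# c + d*z + Υ₁ is positive (incidental truncation); the selection and outcome errors
# are drawn jointly normal with correlation ρ = 0.5.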


# OLS Model

data1 = DataFrame(y = y, x = x)

lm1 = lm(@formula(y ~ x), data1)


# OLS Model with subset of data1

lm2 = lm(@formula(y ~ x), data1[z .< 0.6,1:end])


# Probit

data1.y1 = y .> 0

data1.z = z

glm1 = glm(@formula(y1 ~ z), data1, Binomial(), ProbitLink())


# 6.4.3

# 6.4.4

# 6.4.5


Heckman = function(β, γ, ρ, y, X, Z)

is0 = y .== 0

Zγ_adj = (Z*γ + ρ.*(y - X*β))./(√(1 - ρ^2))

return is0.*logΦ.(-Z*γ) + (1 .- is0).*(logΦ.(Zγ_adj) + logϕ.(y - X*β))

end
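# The function above is the observation-level log-likelihood of the type 2 Tobit
# (Heckman selection) model with the outcome error variance normalized to one:
#   log L_i = 1{y_i = 0} log Φ(-z_i'γ)
#           + 1{y_i > 0} [log Φ((z_i'γ + ρ(y_i - x_i'β))/√(1 - ρ²)) + log ϕ(y_i - x_i'β)].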


Heckman_wrap = function(par)

ρ = exp(par[1])/(1 + exp(par[1]))

J = size(X)[2]

β = par[2:(J+1)]

γ = par[(J+2):end]

return -sum(Heckman(β, γ, ρ, y, X, Z))

end
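# The parameters are stacked as [ρ parameter; β; γ]; the logistic transform
# exp(par[1])/(1 + exp(par[1])) keeps the estimated correlation ρ inside (0, 1).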


global y = data1.y

global X = hcat(ones(length(y)), data1.x)

global Z = hcat(ones(length(y)), data1.z)

init = [0; coef(lm1); coef(glm1)]

a1 = optimize(Heckman_wrap, init)

exp(a1.minimizer[1])/(1 + exp(a1.minimizer[1]))

a1.minimizer[2:3]

a1.minimizer[4:5]
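# For reference, the simulated truth is (a, b) = (6, -3) in the outcome equation,
# (c, d) = (4, -5) in the selection equation, and ρ = 0.5.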



# 6.5

# 6.5.1


path = "/Volumes/My WD/Book"

x = DataFrame(CSV.read(string(path,"/NLSY97_gender_book.csv")))

wage = Vector{Union{Missing,Float64}}(missing, 8984)

fulltime = Vector{Union{Missing,Float64}}(missing, 8984)

for i = 1:8984

if x[i,15] == "NA" || x[i,14] == "NA"

wage[i] = missing

elseif parse(Float64,x[i,15]) > 0

wage[i] = parse(Float64,x[i,14])/parse(Float64,x[i,15])

end

if x[i,15] == "NA"

fulltime[i] = missing

else

if parse(Float64,x[i,15]) > 1750

fulltime[i] = 1

else

fulltime[i] = 0

end

end

#println(i)

end

lwage = Vector{Union{Missing,Float64}}(missing, 8984)

lftwage = Vector{Union{Missing,Float64}}(missing, 8984)

age = Vector{Union{Missing,Float64}}(missing, 8984)

age2 = Vector{Union{Missing,Float64}}(missing, 8984)

college = Vector{Union{Missing,Float64}}(missing, 8984)

msa = Vector{Union{Missing,Float64}}(missing, 8984)

children = Vector{Union{Missing,Float64}}(missing, 8984)

for i = 1:8984

if !ismissing(wage[i])

if wage[i] > 1

lwage[i] = log(wage[i])

else

lwage[i] = 0

end

if lwage[i] > 0 && fulltime[i] == 1

lftwage[i] = lwage[i]

else

lftwage[i] = 0

end

if x.CV_HIGHEST_DEGREE_0708_2007[i] != "NA"

if parse(Int64, x.CV_HIGHEST_DEGREE_0708_2007[i]) > 2

college[i] = 1

else

college[i] = 0

end

end

msa[i] = x.CV_MSA_2007[i] ∈ ["2", "3", "4"]

if x.CV_BIO_CHILD_HH_2007[i] != "NA"

if parse(Int64,x.CV_BIO_CHILD_HH_2007[i]) > 0

children[i] = 1

else

children[i] = 0

end

end

end

println(i)

end

female = x.KEY_SEX_1997 .== 2

black = x.KEY_RACE_ETHNICITY_1997 .== 1

south = x.CV_CENSUS_REGION_2007 .== "3"

urban = x[1:end,13] .== "1"

married = x.CV_MARSTAT_COLLAPSED_2007 .== "2"

age = 2007 .- x.KEY_BDATE_Y_1997

age2 = age.^2


x1 = DataFrame(lftwage=lftwage,female=female, black=black, age=age, age2=age2, msa=msa, urban=urban, south=south, college=college, married=married, children=children)

x1 = dropmissing(x1)

x1.lftwage_bin = x1.lftwage .> 0

x1_f = x1[x1.female .== 1,1:end]

x1_m = x1[x1.female .== 0,1:end]

# Note that these samples are smaller than for the R code;

# it is not clear why there is a difference.


lm1 = lm(@formula(lftwage ~ age + age2 + black + college + south + msa), x1_m)

lm2 = lm(@formula(lftwage ~ age + age2 + black + college + south + msa), x1_f)

lm3 = lm(@formula(lftwage ~ female + age + age2 + black + college + south + msa), x1)


plot(kde(x1_m.lftwage[x1_m.lftwage_bin]))

plot!(kde(x1_f.lftwage[x1_f.lftwage_bin]))


# 6.5.2

glm1 = glm(@formula(lftwage_bin ~ college), x1_f, Binomial(), ProbitLink())

glm2 = glm(@formula(lftwage_bin ~ college + south), x1_f, Binomial(), ProbitLink())

glm3 = glm(@formula(lftwage_bin ~ college + south + married + children), x1_f, Binomial(), ProbitLink())


# 6.5.3

x1.lftwage_norm = (x1.lftwage .- mean(x1.lftwage))./std(x1.lftwage)

x1.lftwage_norm3 = copy(x1.lftwage_norm)  # copy so the zeroing below does not alias lftwage_norm

x1.lftwage_norm3[x1.lftwage .== 0] = zeros(sum(x1.lftwage .== 0))
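# The outcome is standardized, presumably because the Heckman likelihood above fixes
# the outcome error variance at one; the censored observations are then reset to zero
# so that the y .== 0 indicator inside Heckman() still flags them.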

global y = x1.lftwage_norm3[x1.female]

global X = hcat(ones(length(y)), x1.age[x1.female], x1.age2[x1.female], x1.black[x1.female], x1.college[x1.female], x1.south[x1.female], x1.msa[x1.female])

global Z = hcat(ones(length(y)), x1.college[x1.female], x1.south[x1.female], x1.married[x1.female], x1.children[x1.female])

init = [0; coef(lm2); coef(glm3)]

a = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)

β̂ = a.minimizer[2:8]

yₐ = X*β̂ + randn!(rng, zeros(length(y)))

global y = x1.lftwage_norm3[x1.female .== 0]

global X = hcat(ones(length(y)), x1.age[x1.female .== 0], x1.age2[x1.female .== 0], x1.black[x1.female .== 0], x1.college[x1.female .== 0], x1.south[x1.female .== 0], x1.msa[x1.female .== 0])

y_am = X*β̂ + randn!(rng, zeros(length(y)))
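# yₐ is the selection-corrected prediction for women (female covariates with the
# female β̂); y_am applies the same female β̂ to the male covariates. The densities
# below compare these with the observed (normalized) wage distributions by sex.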


plot(kde(yₐ))

plot!(kde(y_am))

plot!(kde(x1.lftwage_norm[(x1.female).*(x1.lftwage .> 0)]))

plot!(kde(x1.lftwage_norm[(x1.female .== 0).*(x1.lftwage .> 0)]))


# 6.6.1

path = "/Volumes/My WD/Book"

x = CSV.read(string(path,"/nls.csv"))

lwage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.lwage76))

wage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.wage76))

for i in 1:length(lwage76_temp)

if length(x.lwage76[i]) > 1

lwage76_temp[i] = parse(Float64, x.lwage76[i])

wage76_temp[i] = parse(Float64, x.wage76[i])

end

end

x.lwage76 = lwage76_temp

x.wage76 = wage76_temp


x1 = DataFrame(x)

x1 = dropmissing(x1)


x1.lwage76_norm = (x1.lwage76 .- mean(x1.lwage76))./std(x1.lwage76)

x1.exp1 = x1.age76 .- x1.ed76 .- 6

x1.exp2 = (x1.exp1.^2)./100

x1.college = x1.ed76 .> 12


# 6.6.2


lm_c = lm(@formula(lwage76_norm ~ exp1 + exp2 + black + reg76r + smsa76r), x1[x1.college,1:end])

lm_nc = lm(@formula(lwage76_norm ~ exp1 + exp2 + black + reg76r + smsa76r), x1[x1.college .== 0,1:end])


# 6.6.3


# Probit Estimate

glm1 = glm(@formula(college ~ nearc4), x1, Binomial(), ProbitLink())

glm2 = glm(@formula(college ~ nearc4 + momdad14), x1, Binomial(), ProbitLink())

glm3 = glm(@formula(college ~ nearc4 + momdad14 + black + smsa66r), x1, Binomial(), ProbitLink())

glm4 = glm(@formula(college ~ nearc4 + momdad14 + black + smsa66r + reg662 + reg663 + reg664 + reg665 + reg666 + reg667 + reg668 + reg669), x1, Binomial(), ProbitLink())


# 6.6.4


# College

global y = x1.lwage76_norm[x1.college]

global X = hcat(ones(length(y)), x1.exp1[x1.college], x1.exp2[x1.college], x1.black[x1.college], x1.reg76r[x1.college], x1.smsa76r[x1.college])

global Z = hcat(ones(length(y)), x1.nearc4[x1.college], x1.momdad14[x1.college], x1.black[x1.college], x1.smsa66r[x1.college], x1.reg662[x1.college], x1.reg663[x1.college], x1.reg664[x1.college], x1.reg665[x1.college], x1.reg666[x1.college], x1.reg667[x1.college], x1.reg668[x1.college], x1.reg669[x1.college])

init = [0; coef(lm_c); coef(glm4)]

a_c = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)


# No college

global y = -x1.lwage76_norm[x1.college .== 0]

global X = hcat(ones(length(y)), x1.exp1[x1.college .== 0], x1.exp2[x1.college .== 0], x1.black[x1.college .== 0], x1.reg76r[x1.college .== 0], x1.smsa76r[x1.college .== 0])

global Z = hcat(ones(length(y)), x1.nearc4[x1.college .== 0], x1.momdad14[x1.college .== 0], x1.black[x1.college .== 0], x1.smsa66r[x1.college .== 0], x1.reg662[x1.college .== 0], x1.reg663[x1.college .== 0], x1.reg664[x1.college .== 0], x1.reg665[x1.college .== 0], x1.reg666[x1.college .== 0], x1.reg667[x1.college .== 0], x1.reg668[x1.college .== 0], x1.reg669[x1.college .== 0])

init = [0; coef(lm_nc); -coef(glm4)]

a_nc = optimize(Heckman_wrap, init, show_trace=true, iterations=10000)


# Predicted wages

μ = [0; 0]

ρ̂ = exp(a_c.minimizer[1])/(1 + exp(a_c.minimizer[1]))

Σ̂ = hcat([1; ρ̂], [ρ̂; 1])

Υ = rand(MvNormal(μ, Σ̂), size(x1)[1])'

β̂_c = a_c.minimizer[2:(length(coef(lm_c))+1)]

β̂_nc = a_nc.minimizer[2:(length(coef(lm_nc))+1)]

global X = hcat(ones(size(x1)[1]), x1.exp1, x1.exp2, x1.black, x1.reg76r, x1.smsa76r)

x1.coll_y = X*β̂_c + Υ[1:end,1]

x1.nocoll_y = -X*β̂_nc + Υ[1:end,2]
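# Because the no-college model was estimated on -y (with -coef(glm4) as starting
# values), the predicted no-college wage uses -X*β̂_nc to return to the original scale.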


scatter(x1.nocoll_y, x1.coll_y)

scatter!(x1.nocoll_y, x1.nocoll_y)

xlabel!("wage (no-college)")

ylabel!("wage (college)")


# 6.6.5


mean(x1.coll_y - x1.nocoll_y)/4
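# Average gap between predicted college and no-college (normalized) log wages,
# divided by four, presumably the four extra years of schooling, to approximate a
# per-year return to college.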