chapter12.jl
# Chapter 12
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 12.1
# 12.2
# 12.2.1
Random.seed!(123456789)
N = 10000
μ = [0; 10; -1; 3]
Σ = zeros(4,4) + Diagonal([2; 1; 3; 1])
x = rand(MvNormal(μ, Σ), N)'
α = 0.4
ranN = rand(N)
y₁ = zeros(N)
y₂ = zeros(N)
for i = 1:N
if ranN[i] < α
y₁[i] = x[i,1]
y₂[i] = x[i,3]
else
y₁[i] = x[i,2]
y₂[i] = x[i,4]
end
end
density(y₁)
density!(y₂, style = :dash)
# 12.2.2
# some useful functions
find_dens = function(h, x)
ls_x = (h.x .- x).^2
a = which_min(ls_x)
min_a = maximum([a - 5; 1])
max_a = minimum([a + 5; 512])
return mean(h.density[min_a:max_a])
end
which_min = function(x)
min_x = minimum(x)
for i = 1:length(x)
if x[i] == min_x
return i
end
end
return "Error"
end
which_max = function(x)
max_x = maximum(x)
for i = 1:length(x)
if x[i] == max_x
return i
end
end
return "Error"
end
q_y₂ = quantile(y₂, 0.5)
a = find_dens(kde(y₂), q_y₂)
b = pdf(Normal(3), q_y₂)
c = pdf(Normal(-1, √3), q_y₂)
(a - b)/(c - b)
# 12.2.3
min1 = minimum([y₁; y₂])
max1 = maximum([y₁; y₂])
K₁ = 10
K₂ = 7
ϵ = 1e-10
diff1 = (max1 - min1)/K₁
diff2 = (max1 - min1)/K₂
prob_mat = Array{Union{Missing, Float64}}(missing, K₂+1, K₁+1)
# conditional probability
prob_mat_x = Array{Union{Missing, Float64}}(missing, K₂+1, 3)
# actual hidden type conditional probability
low1 = min1 - ϵ
high1 = low1 + ϵ + diff1
for k₁ = 1:K₁
low2 = min1 - ϵ
high2 = low2 + ϵ + diff2
prob_mat[1,1+k₁] = (low1 + high1)/2
y₁_c = y₁[(y₂ .> low1).*(y₂ .< high1)]
for k₂ = 1:K₂
prob_mat[1 + k₂, 1] = (low2 + high2)/2
prob_mat[1 + k₂, 1 + k₁] = mean((y₁_c .> low2).*(y₁_c .< high2))
prob_mat_x[1 + k₂, 2] = mean((x[1:end,1] .> low2).*(x[1:end,1] .< high2))
prob_mat_x[1 + k₂, 3] = mean((x[1:end,2] .> low2).*(x[1:end,2] .< high2))
# probabilities conditional on hidden type
low2 = high2 - ϵ
high2 = low2 + ϵ + diff2
#println(k₂)
end
low1 = high1 - ϵ
high1 = low1 + ϵ + diff1
end
prob_mat_x[1:end,1] = prob_mat[1:end,1]
prob_mat
prob_mat_x
scatter(prob_mat[3,2:8],prob_mat[7,2:8])
scatter!(prob_mat_x[[3;7],2],prob_mat_x[[3;7],3], shape = :rect)
xlabel!("y₁ is low")
ylabel!("y₁ is high")
# 12.3
# 12.3.1
# 12.3.2
Random.seed!(123456789)
N = 10000
υ = 1:4
p = [0.2; 0.5; 0.25; 0.05]
ranN = rand(N)
θ = zeros(N)
for i = 1:N
if ranN[i] < p[1]
θ[i] = υ[1]
elseif ranN[i] < sum(p[1:2])
θ[i] = υ[2]
elseif ranN[i] < sum(p[1:3])
θ[i] = υ[3]
else
θ[i] = υ[4]
end
end
e₁ = randn(N).*0.3
e₂ = randn(N).*0.3
x₁ = 1 .+ 0.25.*θ + e₁
x₂ = 0.5 .+ 0.1.*θ + e₂
quantile(x₁, [0, 0.25, 0.5, 0.75, 1])
quantile(x₂, [0, 0.25, 0.5, 0.75, 1])
# 12.3.3
# 12.3.4
# 12.3.5
# 12.3.6
mixture = function(X; centers=2, tol=1e-09, maxiter=500, verb=false)
# the function accets either a matrix or the number of hidden types
if length(centers) > 1
K = size(centers)[1]
else
K = centers
end
# initial types
t = assignments(kmeans!(X', centers'))
T = zeros(size(X)[1], K)
for k = 1:K
T[1:end,k] = t .== k
end
# determine the initial prior estimates
g₁ = []
for k = 1:K
g₁ = [g₁; mean(T[1:end,k])]
end
g₁
# loop to calculate the prior and posterior
diff = sum(abs.(1 .- g₁))
i = 0
while diff > tol && i < maxiter
# determine likelihoods
H = H_fun(X, t, K)
# solve for the posteriors
global γ̂ = γ_fun(X, H, g₁)
# update that categorization of types
for i = 1:size(X)[1]
t[i] = which_max(γ̂[i,1:end])
end
# determine the initial prior estimates
g₂ = []
for k = 1:K
g₂ = [g₂; mean(γ̂[1:end,k])]
end
g₂ = g₂./sum(g₂)
# check for convergence
diff = sum(abs.(g₁ - g₂))
g₁ = g₂
i += 1
if verb
println(diff)
println(i)
end
end
return (g = g₁, γ̂ = γ̂, t = t)
end
H_fun = function(X, t, K)
H = []
for j = 1:size(X)[2]
for k = 1:K
if sum(t .== k) > 0
H = [H; kde(X[t .== k,j], npoints=512, boundary=(minimum(X), maximum(X)))]
else
H = [H; kde(ones(512), npoints=512)]
end
H[end].density = H[end].density./sum(H[end].density)
end
end
return H
end
γ_fun = function(X, H, g)
N = size(X)[1]
J = size(X)[2]
K = length(g)
γ = zeros(N, K)
for i = 1:N
for k = 1:K
ĥ = log(g[k])
for j = 1:J
ĥ = ĥ + log(find_dens(H[K*(j-1)+k], X[i,j]))
end
γ[i,k] = exp(ĥ)
end
γ[i,1:end] = γ[i,1:end]./sum(γ[i,1:end])
end
return γ
end
centers = hcat(1 .+ 0.25.*υ, 0.5 .+ 0.1.*υ)
# columns is the number of signals (2 in this case)
# rows is the number types (4 in this case)
X = hcat(x₁, x₂)
a = mixture(X, centers=centers, verb=true)
mean(x₁[a.t .== 1])
mean(x₁[a.t .== 2])
mean(x₁[a.t .== 3])
mean(x₁[a.t .== 4])
plot(kde(x₁[a.t .== 1]))
plot!(kde(x₁[a.t .== 2]))
plot!(kde(x₁[a.t .== 3]))
plot!(kde(x₁[a.t .== 4]))
xlabel!("x₁")
ylabel!("Density")
# 12.4.2
path = "ENTER PATH HERE"
x = DataFrame(load(string(path,"/pubtwins.dta")))
x_f = x[ismissing.(x.first) .== 0,1:end] # first twin
x_s = x[ismissing.(x.first),1:end] # second twin
# educ "Twin1 Education"
# educt "Twin1 Report of Twin2 Education"
# educ_t "Twin2 Education"
# educt_t "Twin2 Report of Twin1 Education"
# the data is written out in pairs for each set of twins
x_f1 = dropmissing(DataFrame(dlwage = x_f.dlwage, deduc = x_f.deduc, duncov = x_f.duncov, dmaried = x_f.dmaried, dtenure = x_f.dtenure))
X = hcat(ones(340), x_f.deduc)
y = x_f.dlwage
β₁ = inv(X'X)*X'y
X = hcat(ones(333), x_f1.deduc, x_f1.duncov, x_f1.dmaried, x_f1.dtenure)
y = x_f1.dlwage
β₂ = inv(X'X)*X'y
scatter(x_f.deduc, x_f.dlwage, legend = :none)
plot!(x_f.deduc, hcat(ones(340), x_f.deduc)*β₁)
xlabel!("Diff Years of Education")
ylabel!("Diff of Log Wages")
# 12.4.3
y_f = hcat(x_f.educ, x_f.educt_t)
y_s = hcat(x_s.educ, x_s.educt_t)
y = vcat(y_f, y_s)
centers = hcat([12.0; 14.0; 16.0; 18.0], [12.0; 14.0; 16.0; 18.0])
a = mixture(y, centers=centers, verb=true)
plot(kde(y_f[a.t .== 1]))
plot!(kde(y_f[a.t .== 2]))
plot!(kde(y_f[a.t .== 3]))
plot!(kde(y_f[a.t .== 4]))
xlabel!("Education Level")
ylabel!("Density")
# 12.4.4
ed = [12.0; 14.0; 16.0; 18.0]
x_f.deduc = a.γ̂[1:340,1:end]*ed - a.γ̂[341:680,1:end]*ed
scatter(a.γ̂[1:340,1:end]*ed, a.γ̂[341:680,1:end]*ed)
X = hcat(ones(340), x_f.deduc)
y = x_f.dlwage
β₃ = inv(X'X)*X'y
y = x_f.dlwage[1]
X = [1 x_f.deduc[1] x_f.duncov[1] x_f.dmaried[1] x_f.dtenure[1]]
for i = 2:340
if !ismissing(x_f.dtenure[i])
y = [y; x_f.dlwage[i]]
X = [X; [1 x_f.deduc[i] x_f.duncov[i] x_f.dmaried[i] x_f.dtenure[i]]]
end
end
β₄ = inv(X'X)*X'y
# 12.5
# 12.5.1
path = "ENTER PATH HERE"
x = DataFrame(CSV.read(string(path,"/cardkrueger.csv")))
EMPFT = Vector{Union{Missing, Float64}}(missing, 410)
EMPPT = Vector{Union{Missing, Float64}}(missing, 410)
EMPFT2 = Vector{Union{Missing, Float64}}(missing, 410)
EMPPT2 = Vector{Union{Missing, Float64}}(missing, 410)
for i = 1:410
if !(x.EMPFT[i] == "." || x.EMPPT[i] == "." || x.EMPFT2[i] == "." || x.EMPPT2[i] == ".")
EMPFT[i] = parse(Float64, x.EMPFT[i])
EMPPT[i] = parse(Float64, x.EMPPT[i])
EMPFT2[i] = parse(Float64, x.EMPFT2[i])
EMPPT2[i] = parse(Float64, x.EMPPT2[i])
end
end
x1 = DataFrame(SHEET = x.SHEET, FT = EMPFT, PT = EMPPT, STATE = x.STATE, TIME = ones(410))
x1.SHEET[x1.SHEET .== 407] = [4071; 4072]
# there is an issue with the labeling in the data
# two firms with the same label
x2 = DataFrame(SHEET = x.SHEET, FT = EMPFT2, PT = EMPPT2, STATE = x.STATE, TIME = ones(410) .+ 1)
x2.SHEET[x2.SHEET .== 407] = [4071; 4072]
x3 = vcat(x1, x2)
Y = hcat(x3.FT[x3.TIME .== 1], x3.PT[x3.TIME .== 1], x3.FT[x3.TIME .== 2], x3.PT[x3.TIME .== 2])'
treat = x3.STATE[x3.TIME .== 1] .== 1
index_na = []
for j = 1:410
if !ismissing(Y[1,j]) && !ismissing(Y[2,j]) && !ismissing(Y[3,j]) && !ismissing(Y[4,j])
index_na = [index_na; j]
end
end
Y1 = Y[1:2,index_na]
Y2 = Y[3:4,index_na]
treat1 = treat[index_na]
quantile(Y1[1,1:end], [0; 0.25; 0.5; 0.75; 1])
mean(Y1[1,1:end])
quantile(Y1[2,1:end], [0; 0.25; 0.5; 0.75; 1])
mean(Y1[2,1:end])
# 12.5.2
p₁ = [2.0; 2.0; 2.0; 6.0]
p₂ = [11.0; 17.0; 25.0; 11.0]
a = mixture(hcat(Y1[1,1:end], Y1[2,1:end]), centers = hcat(p₁, p₂), verb=true, tol=1e-15)
a.g
# The mixture model ends up with a somewhat different set of types, relative to the R model.
# The result of which is that the treatment effect results are different.
# 12.5.3
scatter(Y1[1,1:end][a.t .== 1], Y1[2,1:end][a.t .== 1])
scatter!(Y1[1,1:end][a.t .== 2], Y1[2,1:end][a.t .== 2])
scatter!(Y1[1,1:end][a.t .== 3], Y1[2,1:end][a.t .== 3])
scatter!(Y1[1,1:end][a.t .== 4], Y1[2,1:end][a.t .== 4])
xlabel!("# Full Time")
ylabel!("# Part Time")
scatter(Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)], Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)], label="NJ")
scatter!(Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)], Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)], shape = :rect, label="PA")
xlabel!("# Full-Time (Before)")
ylabel!("# Part-Time (Before)")
scatter(Y2[1,1:end][(a.t .== 1).*(treat1 .== 1)], Y2[2,1:end][(a.t .== 1).*(treat1 .== 1)], label="NJ")
scatter!(Y2[1,1:end][(a.t .== 1).*(treat1 .== 0)], Y2[2,1:end][(a.t .== 1).*(treat1 .== 0)], shape = :rect, label="PA")
xlabel!("# Full-Time (After)")
ylabel!("# Part-Time (After)")
scatter(Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)], Y1[2,1:end][(a.t .== 2).*(treat1 .== 1)], label="NJ")
scatter!(Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)], Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)], shape = :rect, label="PA")
xlabel!("# Full-Time (Before)")
ylabel!("# Part-Time (Before)")
scatter(Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)], Y2[2,1:end][(a.t .== 2).*(treat1 .== 1)], label="NJ")
scatter!(Y2[1,1:end][(a.t .== 2).*(treat1 .== 0)], Y2[2,1:end][(a.t .== 2).*(treat1 .== 0)], shape = :rect, label="PA")
xlabel!("# Full-Time (After)")
ylabel!("# Part-Time (After)")
did = function(y, treat)
y₁ = y[1,1:end]
y₂ = y[2,1:end]
did = Array{Union{Missing, String, Float64}}(missing, 4, 4)
did[1,2:4] = ["Period 2" "Period 1" "Diff"]
did[2:4,1] = ["Treated" "Not Treated" "Diff"]
did[2,2] = mean(y₂[treat .== 1])
did[3,2] = mean(y₁[treat .== 1])
did[2,3] = mean(y₂[treat .== 0])
did[3,3] = mean(y₁[treat .== 0])
did[4,2] = did[2,2] - did[3,2]
did[4,3] = did[2,3] - did[3,3]
did[2,4] = did[2,2] - did[2,3]
did[3,4] = did[3,2] - did[3,3]
did[4,4] = did[4,2] - did[4,3]
return did
end
y_11 = [Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)]; Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)]]
# full-time employment for type 1 restaurants prior to the minimum wage increase
y_12 = [Y2[1,1:end][(a.t .== 1).*(treat1 .== 0)]; Y2[1,1:end][(a.t .== 1).*(treat1 .== 1)]]
# full-time employment for type 1 restaurants after the minimum wage increase
treat2 = [fill(0, length(Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)])); fill(1, length(Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)]))]
did1 = did(hcat(y_11, y_12)', treat2)
did1[4,4]/did1[2,2]
y_13 = [Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)]; Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)]]
y_14 = [Y2[2,1:end][(a.t .== 1).*(treat1 .== 0)]; Y2[2,1:end][(a.t .== 1).*(treat1 .== 1)]]
treat2 = [fill(0, length(Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)])); fill(1, length(Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)]))]
did2 = did(hcat(y_13, y_14)', treat2)
did2[4,4]/did2[2,2]
y_21 = [Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)]; Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)]]
y_22 = [Y2[1,1:end][(a.t .== 2).*(treat1 .== 0)]; Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)]]
treat2 = [fill(0, length(Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)])); fill(1, length(Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)]))]
did3 = did(hcat(y_21, y_22)', treat2)
did3[4,4]/did3[2,2]
y_23 = [Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)]; Y1[2,1:end][(a.t .== 2).*(treat1 .== 1)]]
y_24 = [Y2[2,1:end][(a.t .== 2).*(treat1 .== 0)]; Y2[2,1:end][(a.t .== 2).*(treat1 .== 1)]]
treat2 = [fill(0, length(Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)])); fill(1, length(Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)]))]
did4 = did(hcat(y_23, y_24)', treat2)
did4[4,4]/did4[2,2]
# 12.6