chapter12.jl

# Chapter 12

#import Pkg

#Pkg.add("Plots")

#Pkg.add("Optim")

#Pkg.add("StatsKit")

#Pkg.add("GLM")

#Pkg.add("DataFrames")

#Pkg.add("StatsPlots")

#Pkg.add("StatFiles")

#Pkg.add("NLSolversBase")

#Pkg.add("Distributions")

using Random

using Statistics

using Plots

using Optim

using NLSolversBase

using StatsKit

using CSV

using GLM

using DataFrames

using LinearAlgebra

using StatsPlots

using StatFiles

using Distributions

# 12.1

# 12.2

# 12.2.1

Random.seed!(123456789)

N = 10000

μ = [0; 10; -1; 3]

Σ = zeros(4,4) + Diagonal([2; 1; 3; 1])

x = rand(MvNormal(μ, Σ), N)'

α = 0.4

ranN = rand(N)

y₁ = zeros(N)

y₂ = zeros(N)

for i = 1:N

if ranN[i] < α

y₁[i] = x[i,1]

y₂[i] = x[i,3]

else

y₁[i] = x[i,2]

y₂[i] = x[i,4]

end

density(y₁)

density!(y₂, style = :dash)

# 12.2.2

# some useful functions

find_dens = function(h, x)

ls_x = (h.x .- x).^2

a = which_min(ls_x)

min_a = maximum([a - 5; 1])

max_a = minimum([a + 5; 512])

return mean(h.density[min_a:max_a])

end

which_min = function(x)

min_x = minimum(x)

for i = 1:length(x)

if x[i] == min_x

return i

end

return "Error"

end

which_max = function(x)

max_x = maximum(x)

for i = 1:length(x)

if x[i] == max_x

return i

end

return "Error"

end

q_y₂ = quantile(y₂, 0.5)

a = find_dens(kde(y₂), q_y₂)

b = pdf(Normal(3), q_y₂)

c = pdf(Normal(-1, √3), q_y₂)

(a - b)/(c - b)

# 12.2.3

min1 = minimum([y₁; y₂])

max1 = maximum([y₁; y₂])

K₁ = 10

K₂ = 7

ϵ = 1e-10

diff1 = (max1 - min1)/K₁

diff2 = (max1 - min1)/K₂

prob_mat = Array{Union{Missing, Float64}}(missing, K₂+1, K₁+1)

# conditional probability

prob_mat_x = Array{Union{Missing, Float64}}(missing, K₂+1, 3)

# actual hidden type conditional probability

low1 = min1 - ϵ

high1 = low1 + ϵ + diff1

for k₁ = 1:K₁

low2 = min1 - ϵ

high2 = low2 + ϵ + diff2

prob_mat[1,1+k₁] = (low1 + high1)/2

y₁_c = y₁[(y₂ .> low1).*(y₂ .< high1)]

for k₂ = 1:K₂

prob_mat[1 + k₂, 1] = (low2 + high2)/2

prob_mat[1 + k₂, 1 + k₁] = mean((y₁_c .> low2).*(y₁_c .< high2))

prob_mat_x[1 + k₂, 2] = mean((x[1:end,1] .> low2).*(x[1:end,1] .< high2))

prob_mat_x[1 + k₂, 3] = mean((x[1:end,2] .> low2).*(x[1:end,2] .< high2))

# probabilities conditional on hidden type

low2 = high2 - ϵ

high2 = low2 + ϵ + diff2

#println(k₂)

end

low1 = high1 - ϵ

high1 = low1 + ϵ + diff1

end

prob_mat_x[1:end,1] = prob_mat[1:end,1]

prob_mat

prob_mat_x

scatter(prob_mat[3,2:8],prob_mat[7,2:8])

scatter!(prob_mat_x[[3;7],2],prob_mat_x[[3;7],3], shape = :rect)

xlabel!("y₁ is low")

ylabel!("y₁ is high")

# 12.3

# 12.3.1

# 12.3.2

Random.seed!(123456789)

N = 10000

υ = 1:4

p = [0.2; 0.5; 0.25; 0.05]

ranN = rand(N)

θ = zeros(N)

for i = 1:N

if ranN[i] < p[1]

θ[i] = υ[1]

elseif ranN[i] < sum(p[1:2])

θ[i] = υ[2]

elseif ranN[i] < sum(p[1:3])

θ[i] = υ[3]

else

θ[i] = υ[4]

end

e₁ = randn(N).*0.3

e₂ = randn(N).*0.3

x₁ = 1 .+ 0.25.*θ + e₁

x₂ = 0.5 .+ 0.1.*θ + e₂

quantile(x₁, [0, 0.25, 0.5, 0.75, 1])

quantile(x₂, [0, 0.25, 0.5, 0.75, 1])

# 12.3.3

# 12.3.4

# 12.3.5

# 12.3.6

mixture = function(X; centers=2, tol=1e-09, maxiter=500, verb=false)

# the function accets either a matrix or the number of hidden types

if length(centers) > 1

K = size(centers)[1]

else

K = centers

end

# initial types

t = assignments(kmeans!(X', centers'))

T = zeros(size(X)[1], K)

for k = 1:K

T[1:end,k] = t .== k

end

# determine the initial prior estimates

g₁ = []

for k = 1:K

g₁ = [g₁; mean(T[1:end,k])]

end

g₁

# loop to calculate the prior and posterior

diff = sum(abs.(1 .- g₁))

i = 0

while diff > tol && i < maxiter

# determine likelihoods

H = H_fun(X, t, K)

# solve for the posteriors

global γ̂ = γ_fun(X, H, g₁)

# update that categorization of types

for i = 1:size(X)[1]

t[i] = which_max(γ̂[i,1:end])

end

# determine the initial prior estimates

g₂ = []

for k = 1:K

g₂ = [g₂; mean(γ̂[1:end,k])]

end

g₂ = g₂./sum(g₂)

# check for convergence

diff = sum(abs.(g₁ - g₂))

g₁ = g₂

i += 1

if verb

println(diff)

println(i)

end

return (g = g₁, γ̂ = γ̂, t = t)

end

H_fun = function(X, t, K)

H = []

for j = 1:size(X)[2]

for k = 1:K

if sum(t .== k) > 0

H = [H; kde(X[t .== k,j], npoints=512, boundary=(minimum(X), maximum(X)))]

else

H = [H; kde(ones(512), npoints=512)]

end

H[end].density = H[end].density./sum(H[end].density)

end

return H

end

γ_fun = function(X, H, g)

N = size(X)[1]

J = size(X)[2]

K = length(g)

γ = zeros(N, K)

for i = 1:N

for k = 1:K

ĥ = log(g[k])

for j = 1:J

ĥ = ĥ + log(find_dens(H[K*(j-1)+k], X[i,j]))

end

γ[i,k] = exp(ĥ)

end

γ[i,1:end] = γ[i,1:end]./sum(γ[i,1:end])

end

return γ

end

centers = hcat(1 .+ 0.25.*υ, 0.5 .+ 0.1.*υ)

# columns is the number of signals (2 in this case)

# rows is the number types (4 in this case)

X = hcat(x₁, x₂)

a = mixture(X, centers=centers, verb=true)

mean(x₁[a.t .== 1])

mean(x₁[a.t .== 2])

mean(x₁[a.t .== 3])

mean(x₁[a.t .== 4])

plot(kde(x₁[a.t .== 1]))

plot!(kde(x₁[a.t .== 2]))

plot!(kde(x₁[a.t .== 3]))

plot!(kde(x₁[a.t .== 4]))

xlabel!("x₁")

ylabel!("Density")

# 12.4.2

path = "ENTER PATH HERE"

x = DataFrame(load(string(path,"/pubtwins.dta")))

x_f = x[ismissing.(x.first) .== 0,1:end] # first twin

x_s = x[ismissing.(x.first),1:end] # second twin

# educ "Twin1 Education"

# educt "Twin1 Report of Twin2 Education"

# educ_t "Twin2 Education"

# educt_t "Twin2 Report of Twin1 Education"

# the data is written out in pairs for each set of twins

x_f1 = dropmissing(DataFrame(dlwage = x_f.dlwage, deduc = x_f.deduc, duncov = x_f.duncov, dmaried = x_f.dmaried, dtenure = x_f.dtenure))

X = hcat(ones(340), x_f.deduc)

y = x_f.dlwage

β₁ = inv(X'X)*X'y

X = hcat(ones(333), x_f1.deduc, x_f1.duncov, x_f1.dmaried, x_f1.dtenure)

y = x_f1.dlwage

β₂ = inv(X'X)*X'y

scatter(x_f.deduc, x_f.dlwage, legend = :none)

plot!(x_f.deduc, hcat(ones(340), x_f.deduc)*β₁)

xlabel!("Diff Years of Education")

ylabel!("Diff of Log Wages")

# 12.4.3

y_f = hcat(x_f.educ, x_f.educt_t)

y_s = hcat(x_s.educ, x_s.educt_t)

y = vcat(y_f, y_s)

centers = hcat([12.0; 14.0; 16.0; 18.0], [12.0; 14.0; 16.0; 18.0])

a = mixture(y, centers=centers, verb=true)

plot(kde(y_f[a.t .== 1]))

plot!(kde(y_f[a.t .== 2]))

plot!(kde(y_f[a.t .== 3]))

plot!(kde(y_f[a.t .== 4]))

xlabel!("Education Level")

ylabel!("Density")

# 12.4.4

ed = [12.0; 14.0; 16.0; 18.0]

x_f.deduc = a.γ̂[1:340,1:end]*ed - a.γ̂[341:680,1:end]*ed

scatter(a.γ̂[1:340,1:end]*ed, a.γ̂[341:680,1:end]*ed)

X = hcat(ones(340), x_f.deduc)

y = x_f.dlwage

β₃ = inv(X'X)*X'y

y = x_f.dlwage[1]

X = [1 x_f.deduc[1] x_f.duncov[1] x_f.dmaried[1] x_f.dtenure[1]]

for i = 2:340

if !ismissing(x_f.dtenure[i])

y = [y; x_f.dlwage[i]]

X = [X; [1 x_f.deduc[i] x_f.duncov[i] x_f.dmaried[i] x_f.dtenure[i]]]

end

β₄ = inv(X'X)*X'y

# 12.5

# 12.5.1

path = "ENTER PATH HERE"

x = DataFrame(CSV.read(string(path,"/cardkrueger.csv")))

EMPFT = Vector{Union{Missing, Float64}}(missing, 410)

EMPPT = Vector{Union{Missing, Float64}}(missing, 410)

EMPFT2 = Vector{Union{Missing, Float64}}(missing, 410)

EMPPT2 = Vector{Union{Missing, Float64}}(missing, 410)

for i = 1:410

if !(x.EMPFT[i] == "." || x.EMPPT[i] == "." || x.EMPFT2[i] == "." || x.EMPPT2[i] == ".")

EMPFT[i] = parse(Float64, x.EMPFT[i])

EMPPT[i] = parse(Float64, x.EMPPT[i])

EMPFT2[i] = parse(Float64, x.EMPFT2[i])

EMPPT2[i] = parse(Float64, x.EMPPT2[i])

end

x1 = DataFrame(SHEET = x.SHEET, FT = EMPFT, PT = EMPPT, STATE = x.STATE, TIME = ones(410))

x1.SHEET[x1.SHEET .== 407] = [4071; 4072]

# there is an issue with the labeling in the data

# two firms with the same label

x2 = DataFrame(SHEET = x.SHEET, FT = EMPFT2, PT = EMPPT2, STATE = x.STATE, TIME = ones(410) .+ 1)

x2.SHEET[x2.SHEET .== 407] = [4071; 4072]

x3 = vcat(x1, x2)

Y = hcat(x3.FT[x3.TIME .== 1], x3.PT[x3.TIME .== 1], x3.FT[x3.TIME .== 2], x3.PT[x3.TIME .== 2])'

treat = x3.STATE[x3.TIME .== 1] .== 1

index_na = []

for j = 1:410

if !ismissing(Y[1,j]) && !ismissing(Y[2,j]) && !ismissing(Y[3,j]) && !ismissing(Y[4,j])

index_na = [index_na; j]

end

Y1 = Y[1:2,index_na]

Y2 = Y[3:4,index_na]

treat1 = treat[index_na]

quantile(Y1[1,1:end], [0; 0.25; 0.5; 0.75; 1])

mean(Y1[1,1:end])

quantile(Y1[2,1:end], [0; 0.25; 0.5; 0.75; 1])

mean(Y1[2,1:end])

# 12.5.2

p₁ = [2.0; 2.0; 2.0; 6.0]

p₂ = [11.0; 17.0; 25.0; 11.0]

a = mixture(hcat(Y1[1,1:end], Y1[2,1:end]), centers = hcat(p₁, p₂), verb=true, tol=1e-15)

a.g

# The mixture model ends up with a somewhat different set of types, relative to the R model.

# The result of which is that the treatment effect results are different.

# 12.5.3

scatter(Y1[1,1:end][a.t .== 1], Y1[2,1:end][a.t .== 1])

scatter!(Y1[1,1:end][a.t .== 2], Y1[2,1:end][a.t .== 2])

scatter!(Y1[1,1:end][a.t .== 3], Y1[2,1:end][a.t .== 3])

scatter!(Y1[1,1:end][a.t .== 4], Y1[2,1:end][a.t .== 4])

xlabel!("# Full Time")

ylabel!("# Part Time")

scatter(Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)], Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)], label="NJ")

scatter!(Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)], Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)], shape = :rect, label="PA")

xlabel!("# Full-Time (Before)")

ylabel!("# Part-Time (Before)")

scatter(Y2[1,1:end][(a.t .== 1).*(treat1 .== 1)], Y2[2,1:end][(a.t .== 1).*(treat1 .== 1)], label="NJ")

scatter!(Y2[1,1:end][(a.t .== 1).*(treat1 .== 0)], Y2[2,1:end][(a.t .== 1).*(treat1 .== 0)], shape = :rect, label="PA")

xlabel!("# Full-Time (After)")

ylabel!("# Part-Time (After)")

scatter(Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)], Y1[2,1:end][(a.t .== 2).*(treat1 .== 1)], label="NJ")

scatter!(Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)], Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)], shape = :rect, label="PA")

xlabel!("# Full-Time (Before)")

ylabel!("# Part-Time (Before)")

scatter(Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)], Y2[2,1:end][(a.t .== 2).*(treat1 .== 1)], label="NJ")

scatter!(Y2[1,1:end][(a.t .== 2).*(treat1 .== 0)], Y2[2,1:end][(a.t .== 2).*(treat1 .== 0)], shape = :rect, label="PA")

xlabel!("# Full-Time (After)")

ylabel!("# Part-Time (After)")

did = function(y, treat)

y₁ = y[1,1:end]

y₂ = y[2,1:end]

did = Array{Union{Missing, String, Float64}}(missing, 4, 4)

did[1,2:4] = ["Period 2" "Period 1" "Diff"]

did[2:4,1] = ["Treated" "Not Treated" "Diff"]

did[2,2] = mean(y₂[treat .== 1])

did[3,2] = mean(y₁[treat .== 1])

did[2,3] = mean(y₂[treat .== 0])

did[3,3] = mean(y₁[treat .== 0])

did[4,2] = did[2,2] - did[3,2]

did[4,3] = did[2,3] - did[3,3]

did[2,4] = did[2,2] - did[2,3]

did[3,4] = did[3,2] - did[3,3]

did[4,4] = did[4,2] - did[4,3]

return did

end

y_11 = [Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)]; Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)]]

# full-time employment for type 1 restaurants prior to the minimum wage increase

y_12 = [Y2[1,1:end][(a.t .== 1).*(treat1 .== 0)]; Y2[1,1:end][(a.t .== 1).*(treat1 .== 1)]]

# full-time employment for type 1 restaurants after the minimum wage increase

treat2 = [fill(0, length(Y1[1,1:end][(a.t .== 1).*(treat1 .== 0)])); fill(1, length(Y1[1,1:end][(a.t .== 1).*(treat1 .== 1)]))]

did1 = did(hcat(y_11, y_12)', treat2)

did1[4,4]/did1[2,2]

y_13 = [Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)]; Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)]]

y_14 = [Y2[2,1:end][(a.t .== 1).*(treat1 .== 0)]; Y2[2,1:end][(a.t .== 1).*(treat1 .== 1)]]

treat2 = [fill(0, length(Y1[2,1:end][(a.t .== 1).*(treat1 .== 0)])); fill(1, length(Y1[2,1:end][(a.t .== 1).*(treat1 .== 1)]))]

did2 = did(hcat(y_13, y_14)', treat2)

did2[4,4]/did2[2,2]

y_21 = [Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)]; Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)]]

y_22 = [Y2[1,1:end][(a.t .== 2).*(treat1 .== 0)]; Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)]]

treat2 = [fill(0, length(Y1[1,1:end][(a.t .== 2).*(treat1 .== 0)])); fill(1, length(Y1[1,1:end][(a.t .== 2).*(treat1 .== 1)]))]

did3 = did(hcat(y_21, y_22)', treat2)

did3[4,4]/did3[2,2]

y_23 = [Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)]; Y1[2,1:end][(a.t .== 2).*(treat1 .== 1)]]

y_24 = [Y2[2,1:end][(a.t .== 2).*(treat1 .== 0)]; Y2[2,1:end][(a.t .== 2).*(treat1 .== 1)]]

treat2 = [fill(0, length(Y1[2,1:end][(a.t .== 2).*(treat1 .== 0)])); fill(1, length(Y2[1,1:end][(a.t .== 2).*(treat1 .== 1)]))]

did4 = did(hcat(y_23, y_24)', treat2)

did4[4,4]/did4[2,2]

# 12.6