chapter10.jl

# Chapter 10


#import Pkg

#Pkg.add("Plots")

#Pkg.add("Optim")

#Pkg.add("StatsKit")

#Pkg.add("GLM")

#Pkg.add("DataFrames")

#Pkg.add("StatsPlots")

#Pkg.add("StatFiles")

#Pkg.add("NLSolversBase")

#Pkg.add("Distributions")

using Random

using Statistics

using Plots

using Optim

using NLSolversBase

using StatsKit

using CSV

using GLM

using DataFrames

using LinearAlgebra

using StatsPlots

using StatFiles

using Distributions


# 10.1

# 10.2

# 10.2.1

# 10.2.2


rng = MersenneTwister(123456789)

N = 1000

T = 2

a = fill(-2, N)

b = 3

d = -4

υ = [5; 6]

x₁ = rand!(rng, zeros(N))

x₂ = x₁ + rand!(rng, zeros(N)) # change in X over time

x = hcat(x₁, x₂)'

e = randn!(rng, zeros(T, N))

y = hcat(a, a)' + b.*x .+ d.*υ + e


# 10.2.3

diffy = y[2,1:end] - y[1,1:end]

diffx = x[2,1:end] - x[1,1:end]

data1 = DataFrame(diffy=diffy, diffx=diffx)

lm1 = lm(@formula(diffy ~ diffx), data1)


# 10.3

# 10.3.1

# 10.3.2


did = function(y, treat)

y₁ = y[1,1:end]

y₂ = y[2,1:end]

did = Array{Union{Missing, String, Float64}}(missing, 4, 4)

did[1,2:4] = ["Period 2" "Period 1" "Diff"]

did[2:4,1] = ["Treated" "Not Treated" "Diff"]

did[2,2] = mean(y₂[treat .== 1])

did[3,2] = mean(y₁[treat .== 1])

did[2,3] = mean(y₂[treat .== 0])

did[3,3] = mean(y₁[treat .== 0])

did[4,2] = did[2,2] - did[3,2]

did[4,3] = did[2,3] - did[3,3]

did[2,4] = did[2,2] - did[2,3]

did[3,4] = did[3,2] - did[3,3]

did[4,4] = did[4,2] - did[4,3]

return did

end


rng = MersenneTwister(123456789)

treat = rand!(rng, zeros(N)) .< 0.5

x₂ = x₁ + treat

# this time the change is 1 for the treated group

x = hcat(x₁, x₂)'

y = hcat(a, a)' + b.*x .+ d.*υ + e

did1 = did(y, treat)


# 10.4

# 10.4.1


path = "/Volumes/My WD/Book"

x = DataFrame(CSV.read(string(path,"/cardkrueger.csv")))

EMPFT = Vector{Union{Missing, Float64}}(missing, 410)

EMPPT = Vector{Union{Missing, Float64}}(missing, 410)

EMPFT2 = Vector{Union{Missing, Float64}}(missing, 410)

EMPPT2 = Vector{Union{Missing, Float64}}(missing, 410)

for i = 1:410

if !(x.EMPFT[i] == "." || x.EMPPT[i] == "." || x.EMPFT2[i] == "." || x.EMPPT2[i] == ".")

EMPFT[i] = parse(Float64, x.EMPFT[i])

EMPPT[i] = parse(Float64, x.EMPPT[i])

EMPFT2[i] = parse(Float64, x.EMPFT2[i])

EMPPT2[i] = parse(Float64, x.EMPPT2[i])

end

end

x1 = DataFrame(SHEET = x.SHEET, FT = EMPFT, PT = EMPPT, STATE = x.STATE, TIME = ones(410))

x1.SHEET[x1.SHEET .== 407] = [4071; 4072]

# there is an issue with the labeling in the data

# two firms with the same label


x2 = DataFrame(SHEET = x.SHEET, FT = EMPFT2, PT = EMPPT2, STATE = x.STATE, TIME = ones(410) .+ 1)

x2.SHEET[x2.SHEET .== 407] = [4071; 4072]


x3 = vcat(x1, x2)


x4 = dropmissing!(x3)


wage = []

for i = 1:410

if !(x.WAGE_ST[i] == ".")

wage = [wage; parse(Float64, x.WAGE_ST[i])]

end

end


histogram(wage, legend = :none)

vline!(fill(4.25, 410))

vline!(fill(5.05, 410))

xlabel!("Wage")

ylabel!("Frequency")



# 10.4.2

y = hcat(x4.FT[x4.TIME .== 1] + x4.PT[x4.TIME .== 1], x4.FT[x4.TIME .== 2] + x4.PT[x4.TIME .== 2])

treat = x4.STATE[x4.TIME .== 1] .== 1

did2 = did(y', treat) # note that y is y'



# 10.5

# 10.5.1

# 10.5.2

# 10.5.3

# 10.5.4

# 10.5.5


fe = function(Y; X = false, cross = true)

T = size(Y)[1]

N = size(Y)[2]

y = Y[1:end,1]

t = 1:T

for i = 2:N

y = vcat(y, Y[1:end,i])

t = vcat(t, 1:T)

end


# set up for different cases

if cross

c = fill(1, T)

for i = 2:N

c = vcat(c, fill(i, T))

end

end

if length(X) > 1

treat = X[1:end,1]

for i = 2:N

treat = vcat(treat, X[1:end,i])

end

end


# estimator

if cross && length(X) > 1

data1 = DataFrame(y = y, treat = treat, t = string.(t), c = string.(c))

lm1 = lm(@formula(y ~ treat + t + c), data1)

elseif length(X) > 1

data1 = DataFrame(y = y, treat = treat, t = string.(t))

lm1 = lm(@formula(y ~ treat + t), data1)

else

data1 = DataFrame(y = y, t = string.(t))

lm1 = lm(@formula(y ~ t), data1)

end


return lm1

end



rng = MersenneTwister(123456789)

N = 100

T = 10

α = rand!(rng, zeros(N))

γ = rand!(rng, zeros(T))

β = 3

ϵ = randn!(rng, zeros(T, N))

treat = rand!(rng, zeros(N)) .< 0.5

y = fill(α[1], T) + γ + ϵ[1:end,1]

for i = 2:N

y = hcat(y, fill(α[i], T) + γ + ϵ[1:end,i])

end

y[1,1:end] = y[1,1:end] + β.*treat

treat1 = zeros(T, N)

treat1[1,1:end] = treat


# standard estimator

lm1 = fe(y, X=treat1)

coef(lm1)[2]

# No individual fixed effect estimator

lm2 = fe(y, X=treat1, cross=false)

coef(lm2)[2]

# Adjusted estimator

y₀ = y[2:T,1:end]

α̂ = fill(mean(y[1:end,1]), T)

for i = 2:N

α̂ = hcat(α̂, fill(mean(y[1:end,i]), T))

end

y₂ = y - α̂

lm3 = fe(y₂, X=treat1, cross=false)

coef(lm3)[2]

# Two step estimator

lm4 = fe(y₀, cross=false) # adjust for time effects

α̂ = fill(mean(residuals(lm4)[1:(T-1)]), T)

for i = 2:N

α̂ᵢ = fill(mean(residuals(lm4)[(1 + (i-1)*(T-1)):(i*(T-1))]), T)

α̂ = hcat(α̂, α̂ᵢ)

end

y₃ = y - α̂

lm5 = fe(y₃, X=treat1, cross=false)

coef(lm5)[2]


# 10.6

# 10.6.1

path = "ENTER PATH HERE"

x = CSV.read(string(path,"/NLSY97Panel.csv"))

x_names = CSV.read(string(path,"/NLSY97Panel_names.csv"), header=false)

year = 1997:2014

year1 = [string.(97:99); "00"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09"; string.(10:14)]

W = Array{Union{Missing, Float64}}(missing, 18, 8984)

Y = Array{Union{Missing, Float64}}(missing, 18, 8984)

for i = 1:18

hrs_name = string("CVC_HOURS_WK_YR_ALL_",year1[i],"_XRND")

inc_name = string("YINC_1700_",year[i])

for j = 1:148

if x_names[1][j] == hrs_name

for k = 1:8984

if x[k,j] >= 0

Y[i,k] = x[k,j]

end

end

end

if x_names[1][j] == inc_name

for k = 1:8984

if x[k,j] >= 0

W[i,k] = x[k,j]

end

end

end

end

end


rate_07 = W[11,1:end]./Y[11,1:end]


treat = Vector{Union{Missing, Int64}}(missing, 8984)

for i = 1:8984

if !(ismissing(rate_07[i]))

if rate_07[i] < 7.26

treat[i] = 1

else

treat[i] = 0

end

end

if !ismissing(W[11,i])

if W[11,i] == 0

treat[i] = 1

end

end

if !ismissing(Y[11,i])

if Y[11,i] == 0

treat[i] = 1

end

end

end


# 10.6.2

N = 600 # reduce the size for computational reasons

T = 18

treat1 = Array{Union{Missing, Int64}}(missing, T, N)

treat1[1:10,1:end] .= 0

for i = 11:T

treat1[i,1:end] = treat[1:N]

end

lm1 = fe(Y[1:end,1:N], X=treat1)

coef(lm1)[2]


N = 8984 # full data set

y₀ = Y[1:10,1:N]

α̂ = Array{Union{Missing, Float64}}(missing, T, N)

for i = 1:N

α̂[1:end,i] = fill(mean(skipmissing(y₀[1:end,i])), T)

end

y₁ = Y - α̂

treat1 = Array{Union{Missing, Int64}}(missing, T, N)

treat1[1:10,1:end] .= 0

for i = 11:T

treat1[i,1:end] = treat[1:N]

end

lm2 = fe(y₁, X=treat1, cross=false)

coef(lm2)[2]


lm3 = fe(y₀, cross=false)

y₀_res = y₀[1,1:end] .- coef(lm3)[1]

y₀_res = hcat(y₀_res, y₀[2,1:end] .- coef(lm3)[3])

y₀_res = hcat(y₀_res, y₀[3,1:end] .- coef(lm3)[4])

y₀_res = hcat(y₀_res, y₀[4,1:end] .- coef(lm3)[5])

y₀_res = hcat(y₀_res, y₀[5,1:end] .- coef(lm3)[6])

y₀_res = hcat(y₀_res, y₀[6,1:end] .- coef(lm3)[7])

y₀_res = hcat(y₀_res, y₀[7,1:end] .- coef(lm3)[8])

y₀_res = hcat(y₀_res, y₀[8,1:end] .- coef(lm3)[9])

y₀_res = hcat(y₀_res, y₀[9,1:end] .- coef(lm3)[10])

y₀_res = hcat(y₀_res, y₀[10,1:end] .- coef(lm3)[2])

y₀_res = y₀_res'


α̂ = fill(mean(skipmissing(y₀_res[1:end,1])), T)

for i = 2:N

α̂ᵢ = fill(mean(skipmissing(y₀_res[1:end,i])), T)

α̂ = hcat(α̂, α̂ᵢ)

end

y₃ = Y - α̂

lm4 = fe(y₃, X=treat1, cross=false)

coef(lm4)[2]


W_nm = W[1:end, ismissing.(treat) .== 0]

Y_nm = Y[1:end, ismissing.(treat) .== 0]

treat_nm = treat[ismissing.(treat) .== 0]

wage_07_nm = W_nm[11,treat_nm .== 1]./Y_nm[11,treat_nm .== 1]

for i = 1:length(wage_07_nm)

if !(ismissing(wage_07_nm[i]))

if isnan(wage_07_nm[i]) || Y_nm[11,treat_nm .== 1][i] == 0

wage_07_nm[i] = missing

end

end

end


c = mean(skipmissing(wage_07_nm)) # 2007 wage rate

d = mean(skipmissing(W_nm[14,treat_nm .== 1])) # 2010 income

e = mean(skipmissing(Y_nm[14,treat_nm .== 1])) # 2010 hours

d - c*(e + 270) # actual less counterfactual

# Note we may not be accounting for missing the same way as the R code.

# Also I calculated the average of the wage rate for the treated

# not the average income divided by average hours for the treated.



# 10.7