chapter10.jl
# Chapter 10
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 10.1
# 10.2
# 10.2.1
# 10.2.2
rng = MersenneTwister(123456789)
N = 1000
T = 2
a = fill(-2, N)
b = 3
d = -4
υ = [5; 6]
x₁ = rand!(rng, zeros(N))
x₂ = x₁ + rand!(rng, zeros(N)) # change in X over time
x = hcat(x₁, x₂)'
e = randn!(rng, zeros(T, N))
y = hcat(a, a)' + b.*x .+ d.*υ + e
# 10.2.3
diffy = y[2,1:end] - y[1,1:end]
diffx = x[2,1:end] - x[1,1:end]
data1 = DataFrame(diffy=diffy, diffx=diffx)
lm1 = lm(@formula(diffy ~ diffx), data1)
# 10.3
# 10.3.1
# 10.3.2
did = function(y, treat)
y₁ = y[1,1:end]
y₂ = y[2,1:end]
did = Array{Union{Missing, String, Float64}}(missing, 4, 4)
did[1,2:4] = ["Period 2" "Period 1" "Diff"]
did[2:4,1] = ["Treated" "Not Treated" "Diff"]
did[2,2] = mean(y₂[treat .== 1])
did[3,2] = mean(y₁[treat .== 1])
did[2,3] = mean(y₂[treat .== 0])
did[3,3] = mean(y₁[treat .== 0])
did[4,2] = did[2,2] - did[3,2]
did[4,3] = did[2,3] - did[3,3]
did[2,4] = did[2,2] - did[2,3]
did[3,4] = did[3,2] - did[3,3]
did[4,4] = did[4,2] - did[4,3]
return did
end
rng = MersenneTwister(123456789)
treat = rand!(rng, zeros(N)) .< 0.5
x₂ = x₁ + treat
# this time the change is 1 for the treated group
x = hcat(x₁, x₂)'
y = hcat(a, a)' + b.*x .+ d.*υ + e
did1 = did(y, treat)
# 10.4
# 10.4.1
path = "/Volumes/My WD/Book"
x = DataFrame(CSV.read(string(path,"/cardkrueger.csv")))
EMPFT = Vector{Union{Missing, Float64}}(missing, 410)
EMPPT = Vector{Union{Missing, Float64}}(missing, 410)
EMPFT2 = Vector{Union{Missing, Float64}}(missing, 410)
EMPPT2 = Vector{Union{Missing, Float64}}(missing, 410)
for i = 1:410
if !(x.EMPFT[i] == "." || x.EMPPT[i] == "." || x.EMPFT2[i] == "." || x.EMPPT2[i] == ".")
EMPFT[i] = parse(Float64, x.EMPFT[i])
EMPPT[i] = parse(Float64, x.EMPPT[i])
EMPFT2[i] = parse(Float64, x.EMPFT2[i])
EMPPT2[i] = parse(Float64, x.EMPPT2[i])
end
end
x1 = DataFrame(SHEET = x.SHEET, FT = EMPFT, PT = EMPPT, STATE = x.STATE, TIME = ones(410))
x1.SHEET[x1.SHEET .== 407] = [4071; 4072]
# there is an issue with the labeling in the data
# two firms with the same label
x2 = DataFrame(SHEET = x.SHEET, FT = EMPFT2, PT = EMPPT2, STATE = x.STATE, TIME = ones(410) .+ 1)
x2.SHEET[x2.SHEET .== 407] = [4071; 4072]
x3 = vcat(x1, x2)
x4 = dropmissing!(x3)
wage = []
for i = 1:410
if !(x.WAGE_ST[i] == ".")
wage = [wage; parse(Float64, x.WAGE_ST[i])]
end
end
histogram(wage, legend = :none)
vline!(fill(4.25, 410))
vline!(fill(5.05, 410))
xlabel!("Wage")
ylabel!("Frequency")
# 10.4.2
y = hcat(x4.FT[x4.TIME .== 1] + x4.PT[x4.TIME .== 1], x4.FT[x4.TIME .== 2] + x4.PT[x4.TIME .== 2])
treat = x4.STATE[x4.TIME .== 1] .== 1
did2 = did(y', treat) # note that y is y'
# 10.5
# 10.5.1
# 10.5.2
# 10.5.3
# 10.5.4
# 10.5.5
fe = function(Y; X = false, cross = true)
T = size(Y)[1]
N = size(Y)[2]
y = Y[1:end,1]
t = 1:T
for i = 2:N
y = vcat(y, Y[1:end,i])
t = vcat(t, 1:T)
end
# set up for different cases
if cross
c = fill(1, T)
for i = 2:N
c = vcat(c, fill(i, T))
end
end
if length(X) > 1
treat = X[1:end,1]
for i = 2:N
treat = vcat(treat, X[1:end,i])
end
end
# estimator
if cross && length(X) > 1
data1 = DataFrame(y = y, treat = treat, t = string.(t), c = string.(c))
lm1 = lm(@formula(y ~ treat + t + c), data1)
elseif length(X) > 1
data1 = DataFrame(y = y, treat = treat, t = string.(t))
lm1 = lm(@formula(y ~ treat + t), data1)
else
data1 = DataFrame(y = y, t = string.(t))
lm1 = lm(@formula(y ~ t), data1)
end
return lm1
end
rng = MersenneTwister(123456789)
N = 100
T = 10
α = rand!(rng, zeros(N))
γ = rand!(rng, zeros(T))
β = 3
ϵ = randn!(rng, zeros(T, N))
treat = rand!(rng, zeros(N)) .< 0.5
y = fill(α[1], T) + γ + ϵ[1:end,1]
for i = 2:N
y = hcat(y, fill(α[i], T) + γ + ϵ[1:end,i])
end
y[1,1:end] = y[1,1:end] + β.*treat
treat1 = zeros(T, N)
treat1[1,1:end] = treat
# standard estimator
lm1 = fe(y, X=treat1)
coef(lm1)[2]
# No individual fixed effect estimator
lm2 = fe(y, X=treat1, cross=false)
coef(lm2)[2]
# Adjusted estimator
y₀ = y[2:T,1:end]
α̂ = fill(mean(y[1:end,1]), T)
for i = 2:N
α̂ = hcat(α̂, fill(mean(y[1:end,i]), T))
end
y₂ = y - α̂
lm3 = fe(y₂, X=treat1, cross=false)
coef(lm3)[2]
# Two step estimator
lm4 = fe(y₀, cross=false) # adjust for time effects
α̂ = fill(mean(residuals(lm4)[1:(T-1)]), T)
for i = 2:N
α̂ᵢ = fill(mean(residuals(lm4)[(1 + (i-1)*(T-1)):(i*(T-1))]), T)
α̂ = hcat(α̂, α̂ᵢ)
end
y₃ = y - α̂
lm5 = fe(y₃, X=treat1, cross=false)
coef(lm5)[2]
# 10.6
# 10.6.1
path = "ENTER PATH HERE"
x = CSV.read(string(path,"/NLSY97Panel.csv"))
x_names = CSV.read(string(path,"/NLSY97Panel_names.csv"), header=false)
year = 1997:2014
year1 = [string.(97:99); "00"; "01"; "02"; "03"; "04"; "05"; "06"; "07"; "08"; "09"; string.(10:14)]
W = Array{Union{Missing, Float64}}(missing, 18, 8984)
Y = Array{Union{Missing, Float64}}(missing, 18, 8984)
for i = 1:18
hrs_name = string("CVC_HOURS_WK_YR_ALL_",year1[i],"_XRND")
inc_name = string("YINC_1700_",year[i])
for j = 1:148
if x_names[1][j] == hrs_name
for k = 1:8984
if x[k,j] >= 0
Y[i,k] = x[k,j]
end
end
end
if x_names[1][j] == inc_name
for k = 1:8984
if x[k,j] >= 0
W[i,k] = x[k,j]
end
end
end
end
end
rate_07 = W[11,1:end]./Y[11,1:end]
treat = Vector{Union{Missing, Int64}}(missing, 8984)
for i = 1:8984
if !(ismissing(rate_07[i]))
if rate_07[i] < 7.26
treat[i] = 1
else
treat[i] = 0
end
end
if !ismissing(W[11,i])
if W[11,i] == 0
treat[i] = 1
end
end
if !ismissing(Y[11,i])
if Y[11,i] == 0
treat[i] = 1
end
end
end
# 10.6.2
N = 600 # reduce the size for computational reasons
T = 18
treat1 = Array{Union{Missing, Int64}}(missing, T, N)
treat1[1:10,1:end] .= 0
for i = 11:T
treat1[i,1:end] = treat[1:N]
end
lm1 = fe(Y[1:end,1:N], X=treat1)
coef(lm1)[2]
N = 8984 # full data set
y₀ = Y[1:10,1:N]
α̂ = Array{Union{Missing, Float64}}(missing, T, N)
for i = 1:N
α̂[1:end,i] = fill(mean(skipmissing(y₀[1:end,i])), T)
end
y₁ = Y - α̂
treat1 = Array{Union{Missing, Int64}}(missing, T, N)
treat1[1:10,1:end] .= 0
for i = 11:T
treat1[i,1:end] = treat[1:N]
end
lm2 = fe(y₁, X=treat1, cross=false)
coef(lm2)[2]
lm3 = fe(y₀, cross=false)
y₀_res = y₀[1,1:end] .- coef(lm3)[1]
y₀_res = hcat(y₀_res, y₀[2,1:end] .- coef(lm3)[3])
y₀_res = hcat(y₀_res, y₀[3,1:end] .- coef(lm3)[4])
y₀_res = hcat(y₀_res, y₀[4,1:end] .- coef(lm3)[5])
y₀_res = hcat(y₀_res, y₀[5,1:end] .- coef(lm3)[6])
y₀_res = hcat(y₀_res, y₀[6,1:end] .- coef(lm3)[7])
y₀_res = hcat(y₀_res, y₀[7,1:end] .- coef(lm3)[8])
y₀_res = hcat(y₀_res, y₀[8,1:end] .- coef(lm3)[9])
y₀_res = hcat(y₀_res, y₀[9,1:end] .- coef(lm3)[10])
y₀_res = hcat(y₀_res, y₀[10,1:end] .- coef(lm3)[2])
y₀_res = y₀_res'
α̂ = fill(mean(skipmissing(y₀_res[1:end,1])), T)
for i = 2:N
α̂ᵢ = fill(mean(skipmissing(y₀_res[1:end,i])), T)
α̂ = hcat(α̂, α̂ᵢ)
end
y₃ = Y - α̂
lm4 = fe(y₃, X=treat1, cross=false)
coef(lm4)[2]
W_nm = W[1:end, ismissing.(treat) .== 0]
Y_nm = Y[1:end, ismissing.(treat) .== 0]
treat_nm = treat[ismissing.(treat) .== 0]
wage_07_nm = W_nm[11,treat_nm .== 1]./Y_nm[11,treat_nm .== 1]
for i = 1:length(wage_07_nm)
if !(ismissing(wage_07_nm[i]))
if isnan(wage_07_nm[i]) || Y_nm[11,treat_nm .== 1][i] == 0
wage_07_nm[i] = missing
end
end
end
c = mean(skipmissing(wage_07_nm)) # 2007 wage rate
d = mean(skipmissing(W_nm[14,treat_nm .== 1])) # 2010 income
e = mean(skipmissing(Y_nm[14,treat_nm .== 1])) # 2010 hours
d - c*(e + 270) # actual less counterfactual
# Note we may not be accounting for missing the same way as the R code.
# Also I calculated the average of the wage rate for the treated
# not the average income divided by average hours for the treated.
# 10.7