chapter8.jl
# Chapter 8
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 8.1
# 8.2
# 8.2.1
path = "ENTER PATH NAME"
x = DataFrame(CSV.read(string(path,"/NFLQ1.csv")))
yardline_100 = Vector{Union{Missing, Float64}}(missing, 98213)
ep = Vector{Union{Missing, Float64}}(missing, 98213)
for i = 1:98213
if !(x.yardline_100[i] == "NA" || x.ep[i] == "NA")
yardline_100[i] = parse(Float64,x.yardline_100[i])
ep[i] = parse(Float64, x.ep[i])
end
end
x1 = DataFrame(yardline_100 = yardline_100, ep = ep)
x1 = dropmissing!(x1)
ave_ep = Vector{Union{Missing, Float64}}(missing, 99)
for i = 1:99
ave_ep[i] = mean(x1.ep[x1.yardline_100 .== i])
end
scatter(1:99, ave_ep)
hline!(fill(0.0,99))
xlabel!("Yards to Goal")
ylabel!("Expected Points")
# 8.2.2
# 8.2.3
# 8.2.4
# 8.2.5
# 8.2.6
q_fun = function(θ, Ep)
θ₁ = θ[1]
θ₂ = θ[2]
θ₃ = θ[3]
θ₄ = θ[4]
Ep_run = Ep[1:end,1]
Ep_pass = Ep[1:end,2]
Ep_fail = Ep[1:end,3]
V_rr = -(1 - θ₁).*Ep_fail + θ₁.*Ep_run
V_rp = -(1 - θ₂).*Ep_fail + θ₂.*Ep_run
V_pr = -(1 - θ₃).*Ep_fail + θ₃.*Ep_pass
V_pp = -(1 - θ₄).*Ep_fail + θ₄.*Ep_pass
q_o = (V_rp - V_rr)./((V_rp - V_rr) + (V_pr - V_pp))
q_d = (V_pr - V_rr)./((V_pr - V_rr) + (V_rp - V_pp))
for i = 1:length(q_o)
q_o[i] = minimum([maximum([q_o[i]; 0]); 1])
end
for i = 1:length(q_d)
q_d[i] = minimum([maximum([q_d[i]; 0]); 1])
end
# forcing results to be probabilities
return (q_o = q_o, q_d = q_d)
end
# 8.2.7
rng = MersenneTwister(123456789)
N = 2000
θ = [0.2; 0.8; 0.5; 0.1]
Y = 50 .+ rand!(rng, zeros(N)).*20
Y_k = Y .- (randn!(rng, zeros(N)) .+ 35)
Y_p = Y .- (randn!(rng, zeros(N)) .+ 15)
Y_r = Y .- (randn!(rng, zeros(N)) .+ 3)
for i = 1:N
if Y_k[i] < 0
Y_k[i] = 20
end
if Y_p[i] < 0
Y_p[i] = 0
end
if Y_r[i] < 0
Y_r[i] = 0
end
end
EP = function(x)
5 - 7*x/100
end
q₃ = q_fun(θ,hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y_k)))
q₄ = q_fun(θ,hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y)))
tab_res = Array{Union{Missing, String, Float64}}(missing,7,5)
tab_res[1,1:end] = ["" "3rd O Pass" "3rd D Pass" "4th O Pass" "4th D Pass"]
tab_res[1:end,1] = ["" "Min." "1st Qu." "Median" "3rd Qu" "Mean" "Max."]
list = [q₃.q_o, q₃.q_d, q₄.q_o, q₄.q_d]
for i = 1:4
tab_res[2,i+1] = minimum(list[i])
tab_res[3:5,i+1] = quantile(list[i], [0.25;0.5;0.75])
tab_res[6,i+1] = mean(list[i])
tab_res[7,i+1] = maximum(list[i])
end
tab_res
Pass3 = rand!(rng, zeros(N)) .< q₃.q_o
success3 = rand!(rng, zeros(N)) .< Pass3.*(θ₃.*(1 .- q₃.q_d) + θ₄.*q₃.q_d) + (1 .- Pass3).*(θ₁.*(1 .- q₃.q_d) + θ₂.*q₃.q_d)
play = Vector{Union{Missing, String}}(missing,N)
for i = 1:N
if Pass3[i] == 1
play[i] = "pass"
else
play[i] = "run"
end
end
# Fourth down
Pass4 = rand!(rng, zeros(N)) .< q₄.q_o
success4 = rand!(rng, zeros(N)) .< Pass4.*(θ₃.*(1 .- q₄.q_d) + θ₄.*q₄.q_d) + (1 .- Pass4).*(θ₁.*(1 .- q₄.q_d) + θ₂.*q₄.q_d)
mean(success3)
mean(success4)
# 8.3
# 8.3.1
# 8.3.2
rng = MersenneTwister(123456789)
N = 500
a = fill(2, N)
b = 3
x = rand!(rng, zeros(N))
υ = randn!(rng, zeros(N)).*2
y = a + b.*x + υ
f_1mom_sos = function(β, y, X)
mean((y - X*β).^2)
end
f_1mom = function(β)
return f_1mom_sos(β, y, X)
end
f_2mom_sos = function(β, σ, y, X)
mean(((y - X*β).^2 .- σ^2).^2)
end
f_2mom = function(par)
σ = exp(par[1])
β = par[2:end]
return f_2mom_sos(β, σ, y, X)
end
global y = y
global X = hcat(ones(N), x)
init = [2.0; 3.0]
a = optimize(f_1mom, init)
a.minimizer
init = [log(2); 2.0; 3.0]
b = optimize(f_2mom, init)
exp(b.minimizer[1])
b.minimizer[2:end]
f_gmm_simple_sos = function(β, σ, y, X)
mean((y - X*β).^2) + mean(((y - X*β).^2 .- σ^2).^2)
end
f_gmm_simple = function(par)
σ = exp(par[1])
β = par[2:end]
return f_gmm_simple_sos(β, σ, y, X)
end
c = optimize(f_gmm_simple, init)
exp(c.minimizer[1])
c.minimizer[2:end]
# 8.3.3
# 8.3.4
gmm = function(G, K)
N = size(G)[2]
if K == size(G)[1]
# a check that the matrix G has K rows
g = []
for k = 1:K
g = [g; mean(G[k,1:end])]
end
try
W = inv((G*G')./N)
return g'*W*g
catch
println("Warning: W is the identity")
return g'g
end
else
return "ERROR: Incorrect dimension"
end
end
f_ols_gmm_sos = function(β, σ, y, X)
g₁ = y - X*β
g₂ = (y - X*β).^2 .- σ^2
return gmm(hcat(g₁, g₂)', 2)
end
f_ols_gmm = function(par)
σ = exp(par[1])
β = par[2:end]
return f_ols_gmm_sos(β, σ, y, X)
end
d = optimize(f_ols_gmm, init)
exp(d.minimizer[1])
d.minimizer[2:end]
# 8.3.5
path = "ENTER PATH NAME"
x = CSV.read(string(path,"/nls.csv"))
lwage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.lwage76))
wage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.wage76))
for i in 1:length(lwage76_temp)
if length(x.lwage76[i]) > 1
lwage76_temp[i] = parse(Float64, x.lwage76[i])
wage76_temp[i] = parse(Float64, x.wage76[i])
end
end
x.lwage76 = lwage76_temp
x.wage76 = wage76_temp
x1 = DataFrame(x)
x1 = dropmissing(x1)
x1.age2 = x1.age76.^2
x1.exp1 = x1.age76 - x1.ed76 - fill(6,3010)
x1.exp2 = (x1.exp1.^2)./100
f_iv_gmm_sos = function(β, y, X, Z)
g₁ = Z[1:end,1].*(y - X*β)
g₂ = Z[1:end,2].*(y - X*β)
return gmm(hcat(g₁, g₂)', 2)
end
f_iv_gmm = function(par)
return f_iv_gmm_sos(par, y, X, Z)
end
global y = x1.lwage76
global X = hcat(ones(length(y)), x1.ed76, x1.exp1, x1.exp2, x1.black, x1.reg76r, x1.smsa76r, x1.smsa66r, x1.reg662, x1.reg663, x1.reg664, x1.reg665, x1.reg666, x1.reg667, x1.reg668, x1.reg669)
global Z = hcat(x1.nearc4, x1.momdad14)
β̂ = inv(X'X)X'y
a = optimize(f_iv_gmm, β̂)
a.minimizer[2]
# 8.4
# 8.4.1
# 8.4.2
p3_sos = function(θ, Ep, s, play)
θ₁ = θ[1]
θ₂ = θ[2]
θ₃ = θ[3]
θ₄ = θ[4]
q₃ = q_fun(θ, Ep)
Pass = play .== "pass"
g₁ = Pass.*(s .- θ₃.*(1 .- q₃.q_d) - θ₄.*q₃.q_d)
g₂ = (1 .- Pass).*(s .- θ₁.*(1 .- q₃.q_d) - θ₂.*q₃.q_d)
g₃ = Pass .- q₃.q_o
G = hcat(g₁, g₂, g₃)'
return gmm(G, 3)
end
p₃ = function(par)
θ = exp.(par)./(1 .+ exp.(par))
return p3_sos(θ, Ep, s, play)
end
global Ep = hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y_k))
global s = success3
global play = play
init = log.(2*θ)
exp.(init)./(1 .+ exp.(init))
a = optimize(p₃, init)
exp.(a.minimizer)./(1 .+ exp.(a.minimizer))
# 8.5
# 8.5.1
path = "ENTER PATH NAME"
x = DataFrame(CSV.read(string(path,"/NFLQ1.csv")))
id = 1:98213
yardline_100 = Vector{Union{Missing, Float64}}(missing, 98213)
ep = Vector{Union{Missing, Float64}}(missing, 98213)
down = Vector{Union{Missing, Float64}}(missing, 98213)
for i = 1:98213
if !(x.yardline_100[i] == "NA" || x.ep[i] == "NA" || x.down[i] == "NA")
yardline_100[i] = parse(Float64,x.yardline_100[i])
ep[i] = parse(Float64, x.ep[i])
down[i] = parse(Float64, x.down[i])
end
end
x.year = year.(x.game_date)
res_pos = Vector{Union{Missing, String}}(missing, 98213)
res_ep = Vector{Union{Missing, Float64}}(missing, 98213)
res_game = Vector{Union{Missing, Int64}}(missing, 98213)
res_down = Vector{Union{Missing, Float64}}(missing, 98213)
for i = 2:98213
res_pos[i-1] = x.posteam[i]
res_ep[i-1] = ep[i]
res_game[i-1] = x.game_id[i]
res_down[i-1] = down[i]
end
for i = 1:98213
if !(ismissing(res_pos[i]) || ismissing(x.posteam[i]))
if x.posteam[i] == res_pos[i]
res_ep[i] = res_ep[i]
else
res_ep[i] = -res_ep[i]
end
end
end
diff_ep = res_ep - ep
succ = Vector{Union{Missing, Int64}}(missing, 98213)
for i = 1:98213
if !(ismissing(x.game_id[i]) || ismissing(res_game[i]) || ismissing(x.posteam[i]) || ismissing(res_pos[i]) || ismissing(down[i]) || ismissing(res_down[i]))
if !(x.game_id[i] == res_game[i])
diff_ep[i] = missing
end
if res_down[i] == 1 && x.posteam[i] == res_pos[i] && down[i] == 3
succ[i] = 1
elseif res_down[i] == 4 || down[i] == 4 || down[i] == 1
succ[i] = 0
end
end
end
pct_field = yardline_100./100
x1 = DataFrame(id = id, yardline_100 = yardline_100, ep = ep, succ = succ, down = down, year = x.year, pct_field = pct_field, ydstogo = x.ydstogo, play_type = x.play_type, diff_ep = diff_ep)
x1 = dropmissing!(x1)
# Note that this is a little different from the R code because it drops missing.
# 8.5.2
lm_run = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[(x1.play_type .== "run").*(x1.down .== 3).*(x1.succ .== 1),1:end])
lm_pass = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[(x1.play_type .== "pass").*(x1.down .== 3).*(x1.succ .== 1),1:end])
lm_punt = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[x1.play_type .== "punt",1:end])
X = hcat(ones(53129), x1.ydstogo, x1.pct_field, x1.year)
x1.run_ep = x1.ep + X*coef(lm_run)
x1.pass_ep = x1.ep + X*coef(lm_pass)
x1.punt_ep = x1.ep + X*coef(lm_punt)
rng = MersenneTwister(123456789)
Θ̂ = Array{Union{Missing, Float64}}(missing, 4*30, 4)
global k = 1
for i = 1:4
tg = i # togo distance
for j = 1:30
yl = 29 + j # yardline
index = (x1.down .== 3).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2)).*((x1.play_type .== "pass") + (x1.play_type .== "run"))
global Ep = hcat(x1.run_ep[index .== 1], x1.pass_ep[index .== 1], x1.punt_ep[index .== 1])
global s = x1.succ[index .== 1]
global play = x1.play_type[index .== 1]
init = randn!(rng, zeros(4)).*2
a = optimize(p₃, init, g_tol=1e-15)
Θ̂[k,1:end] = exp.(a.minimizer)./(1 .+ exp.(a.minimizer))
global k += 1
println(k)
end
end
Θ̂
# 8.5.3
lm_over = lm(@formula(ep ~ pct_field + pct_field^2), x1[x1.down.==1,1:end])
x1.over_ep = -hcat(ones(53129), 1 .- x1.pct_field, (1 .- x1.pct_field).^2)*coef(lm_over)
p₄ = function(θ, Ep)
θ₁ = θ[1]
θ₂ = θ[2]
θ₃ = θ[3]
θ₄ = θ[4]
q₄ = q_fun(θ, Ep)
p_4p = q₄.q_o.*(θ₃.*(1 .- q₄.q_d) + θ₄.*q₄.q_d)
p_4r = (1 .- q₄.q_o).*(θ₁.*(1 .- q₄.q_d) + θ₂.*q₄.q_d)
p_4 = p_4p + p_4r
Epgfi = p_4r.*Ep[1:end,1] + p_4p.*Ep[1:end,2] + (1 .- p_4).*Ep[1:end,3]
return (p_4 = p_4, Epgfi = Epgfi)
end
tab_res = Array{Union{Missing, Float64}}(missing, 4, 30)
prob_actual = Array{Union{Missing, Float64}}(missing, 4, 30)
prob_pred = Array{Union{Missing, Float64}}(missing, 4, 30)
global k = 1
for i = 1:4
tg = i # togo distance
for j = 1:30
yl = 29 + j # yardline
# create subset for analysis
index3 = (x1.down .== 3).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2)).*((x1.play_type .== "pass") + (x1.play_type .== "run"))
y = x1[index3 .== 1,1:end]
# determine predicted success
succ4 = p₄(Θ̂[k,1:end], hcat(y.run_ep, y.pass_ep, y.over_ep))
index4 = (x1.down .== 4).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2))
z = x1[index4 .== 1,1:end]
go = Vector{Union{Missing, Float64}}(missing, size(z)[1])
for l = 1:size(z)[1]
if z.play_type[l] == "run" || z.play_type[l] == "pass"
go[l] = 1
elseif z.play_type[l] == "punt"
go[l] = 0
end
end
# relative value of punting
tab_res[i, j] = mean(y.punt_ep - succ4.Epgfi)
# predicted probability of going for it
prob_pred[i, j] = mean(y.punt_ep - succ4.Epgfi .< 0)
# actual probability of going for it
prob_actual[i, j] = mean(skipmissing(go))
global k += 1
println(k)
end
end
tab_res
tab_res2 = Array{Union{Missing, String, Float64}}(missing,7,5)
tab_res2[1,1:end] = ["" "1 To Go" "2 To Go" "3 To Go" "4 To Go"]
tab_res2[1:end,1] = ["" "Min." "1st Qu." "Median" "3rd Qu" "Mean" "Max."]
for i = 1:4
tab_res2[2,i+1] = minimum(tab_res[i,1:end])
tab_res2[3:5,i+1] = quantile(tab_res[i,1:end], [0.25;0.5;0.75])
tab_res2[6,i+1] = mean(tab_res[i,1:end])
tab_res2[7,i+1] = maximum(tab_res[i,1:end])
end
tab_res2
# 8.5.4
prob_diff = Vector{Union{Missing, Float64}}(missing, 4*30)
global k = 1
for i = 1:4
for j = 1:30
prob_diff[k] = prob_actual[i,j] - prob_pred[i,j]
global k += 1
end
end
histogram(prob_diff)
vline!(zeros(4*30))
xlabel!("Actual Prob - Pred Prob")