chapter8.jl

# Chapter 8

#import Pkg

#Pkg.add("Plots")

#Pkg.add("Optim")

#Pkg.add("StatsKit")

#Pkg.add("GLM")

#Pkg.add("DataFrames")

#Pkg.add("StatsPlots")

#Pkg.add("StatFiles")

#Pkg.add("NLSolversBase")

#Pkg.add("Distributions")

using Random

using Statistics

using Plots

using Optim

using NLSolversBase

using StatsKit

using CSV

using GLM

using DataFrames

using LinearAlgebra

using StatsPlots

using StatFiles

using Distributions

# 8.1

# 8.2

# 8.2.1

path = "ENTER PATH NAME"

x = DataFrame(CSV.read(string(path,"/NFLQ1.csv")))

yardline_100 = Vector{Union{Missing, Float64}}(missing, 98213)

ep = Vector{Union{Missing, Float64}}(missing, 98213)

for i = 1:98213

if !(x.yardline_100[i] == "NA" || x.ep[i] == "NA")

yardline_100[i] = parse(Float64,x.yardline_100[i])

ep[i] = parse(Float64, x.ep[i])

end

x1 = DataFrame(yardline_100 = yardline_100, ep = ep)

x1 = dropmissing!(x1)

ave_ep = Vector{Union{Missing, Float64}}(missing, 99)

for i = 1:99

ave_ep[i] = mean(x1.ep[x1.yardline_100 .== i])

end

scatter(1:99, ave_ep)

hline!(fill(0.0,99))

xlabel!("Yards to Goal")

ylabel!("Expected Points")

# 8.2.2

# 8.2.3

# 8.2.4

# 8.2.5

# 8.2.6

q_fun = function(θ, Ep)

θ₁ = θ[1]

θ₂ = θ[2]

θ₃ = θ[3]

θ₄ = θ[4]

Ep_run = Ep[1:end,1]

Ep_pass = Ep[1:end,2]

Ep_fail = Ep[1:end,3]

V_rr = -(1 - θ₁).*Ep_fail + θ₁.*Ep_run

V_rp = -(1 - θ₂).*Ep_fail + θ₂.*Ep_run

V_pr = -(1 - θ₃).*Ep_fail + θ₃.*Ep_pass

V_pp = -(1 - θ₄).*Ep_fail + θ₄.*Ep_pass

q_o = (V_rp - V_rr)./((V_rp - V_rr) + (V_pr - V_pp))

q_d = (V_pr - V_rr)./((V_pr - V_rr) + (V_rp - V_pp))

for i = 1:length(q_o)

q_o[i] = minimum([maximum([q_o[i]; 0]); 1])

end

for i = 1:length(q_d)

q_d[i] = minimum([maximum([q_d[i]; 0]); 1])

end

# forcing results to be probabilities

return (q_o = q_o, q_d = q_d)

end

# 8.2.7

rng = MersenneTwister(123456789)

N = 2000

θ = [0.2; 0.8; 0.5; 0.1]

Y = 50 .+ rand!(rng, zeros(N)).*20

Y_k = Y .- (randn!(rng, zeros(N)) .+ 35)

Y_p = Y .- (randn!(rng, zeros(N)) .+ 15)

Y_r = Y .- (randn!(rng, zeros(N)) .+ 3)

for i = 1:N

if Y_k[i] < 0

Y_k[i] = 20

end

if Y_p[i] < 0

Y_p[i] = 0

end

if Y_r[i] < 0

Y_r[i] = 0

end

EP = function(x)

5 - 7*x/100

end

q₃ = q_fun(θ,hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y_k)))

q₄ = q_fun(θ,hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y)))

tab_res = Array{Union{Missing, String, Float64}}(missing,7,5)

tab_res[1,1:end] = ["" "3rd O Pass" "3rd D Pass" "4th O Pass" "4th D Pass"]

tab_res[1:end,1] = ["" "Min." "1st Qu." "Median" "3rd Qu" "Mean" "Max."]

list = [q₃.q_o, q₃.q_d, q₄.q_o, q₄.q_d]

for i = 1:4

tab_res[2,i+1] = minimum(list[i])

tab_res[3:5,i+1] = quantile(list[i], [0.25;0.5;0.75])

tab_res[6,i+1] = mean(list[i])

tab_res[7,i+1] = maximum(list[i])

end

tab_res

Pass3 = rand!(rng, zeros(N)) .< q₃.q_o

success3 = rand!(rng, zeros(N)) .< Pass3.*(θ₃.*(1 .- q₃.q_d) + θ₄.*q₃.q_d) + (1 .- Pass3).*(θ₁.*(1 .- q₃.q_d) + θ₂.*q₃.q_d)

play = Vector{Union{Missing, String}}(missing,N)

for i = 1:N

if Pass3[i] == 1

play[i] = "pass"

else

play[i] = "run"

end

# Fourth down

Pass4 = rand!(rng, zeros(N)) .< q₄.q_o

success4 = rand!(rng, zeros(N)) .< Pass4.*(θ₃.*(1 .- q₄.q_d) + θ₄.*q₄.q_d) + (1 .- Pass4).*(θ₁.*(1 .- q₄.q_d) + θ₂.*q₄.q_d)

mean(success3)

mean(success4)

# 8.3

# 8.3.1

# 8.3.2

rng = MersenneTwister(123456789)

N = 500

a = fill(2, N)

b = 3

x = rand!(rng, zeros(N))

υ = randn!(rng, zeros(N)).*2

y = a + b.*x + υ

f_1mom_sos = function(β, y, X)

mean((y - X*β).^2)

end

f_1mom = function(β)

return f_1mom_sos(β, y, X)

end

f_2mom_sos = function(β, σ, y, X)

mean(((y - X*β).^2 .- σ^2).^2)

end

f_2mom = function(par)

σ = exp(par[1])

β = par[2:end]

return f_2mom_sos(β, σ, y, X)

end

global y = y

global X = hcat(ones(N), x)

init = [2.0; 3.0]

a = optimize(f_1mom, init)

a.minimizer

init = [log(2); 2.0; 3.0]

b = optimize(f_2mom, init)

exp(b.minimizer[1])

b.minimizer[2:end]

f_gmm_simple_sos = function(β, σ, y, X)

mean((y - X*β).^2) + mean(((y - X*β).^2 .- σ^2).^2)

end

f_gmm_simple = function(par)

σ = exp(par[1])

β = par[2:end]

return f_gmm_simple_sos(β, σ, y, X)

end

c = optimize(f_gmm_simple, init)

exp(c.minimizer[1])

c.minimizer[2:end]

# 8.3.3

# 8.3.4

gmm = function(G, K)

N = size(G)[2]

if K == size(G)[1]

# a check that the matrix G has K rows

g = []

for k = 1:K

g = [g; mean(G[k,1:end])]

end

try

W = inv((G*G')./N)

return g'*W*g

catch

println("Warning: W is the identity")

return g'g

end

else

return "ERROR: Incorrect dimension"

end

f_ols_gmm_sos = function(β, σ, y, X)

g₁ = y - X*β

g₂ = (y - X*β).^2 .- σ^2

return gmm(hcat(g₁, g₂)', 2)

end

f_ols_gmm = function(par)

σ = exp(par[1])

β = par[2:end]

return f_ols_gmm_sos(β, σ, y, X)

end

d = optimize(f_ols_gmm, init)

exp(d.minimizer[1])

d.minimizer[2:end]

# 8.3.5

path = "ENTER PATH NAME"

x = CSV.read(string(path,"/nls.csv"))

lwage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.lwage76))

wage76_temp = Vector{Union{Missing, Float64}}(missing,length(x.wage76))

for i in 1:length(lwage76_temp)

if length(x.lwage76[i]) > 1

lwage76_temp[i] = parse(Float64, x.lwage76[i])

wage76_temp[i] = parse(Float64, x.wage76[i])

end

x.lwage76 = lwage76_temp

x.wage76 = wage76_temp

x1 = DataFrame(x)

x1 = dropmissing(x1)

x1.age2 = x1.age76.^2

x1.exp1 = x1.age76 - x1.ed76 - fill(6,3010)

x1.exp2 = (x1.exp1.^2)./100

f_iv_gmm_sos = function(β, y, X, Z)

g₁ = Z[1:end,1].*(y - X*β)

g₂ = Z[1:end,2].*(y - X*β)

return gmm(hcat(g₁, g₂)', 2)

end

f_iv_gmm = function(par)

return f_iv_gmm_sos(par, y, X, Z)

end

global y = x1.lwage76

global X = hcat(ones(length(y)), x1.ed76, x1.exp1, x1.exp2, x1.black, x1.reg76r, x1.smsa76r, x1.smsa66r, x1.reg662, x1.reg663, x1.reg664, x1.reg665, x1.reg666, x1.reg667, x1.reg668, x1.reg669)

global Z = hcat(x1.nearc4, x1.momdad14)

β̂ = inv(X'X)X'y

a = optimize(f_iv_gmm, β̂)

a.minimizer[2]

# 8.4

# 8.4.1

# 8.4.2

p3_sos = function(θ, Ep, s, play)

θ₁ = θ[1]

θ₂ = θ[2]

θ₃ = θ[3]

θ₄ = θ[4]

q₃ = q_fun(θ, Ep)

Pass = play .== "pass"

g₁ = Pass.*(s .- θ₃.*(1 .- q₃.q_d) - θ₄.*q₃.q_d)

g₂ = (1 .- Pass).*(s .- θ₁.*(1 .- q₃.q_d) - θ₂.*q₃.q_d)

g₃ = Pass .- q₃.q_o

G = hcat(g₁, g₂, g₃)'

return gmm(G, 3)

end

p₃ = function(par)

θ = exp.(par)./(1 .+ exp.(par))

return p3_sos(θ, Ep, s, play)

end

global Ep = hcat(EP.(Y_r), EP.(Y_p), EP.(100 .- Y_k))

global s = success3

global play = play

init = log.(2*θ)

exp.(init)./(1 .+ exp.(init))

a = optimize(p₃, init)

exp.(a.minimizer)./(1 .+ exp.(a.minimizer))

# 8.5

# 8.5.1

path = "ENTER PATH NAME"

x = DataFrame(CSV.read(string(path,"/NFLQ1.csv")))

id = 1:98213

yardline_100 = Vector{Union{Missing, Float64}}(missing, 98213)

ep = Vector{Union{Missing, Float64}}(missing, 98213)

down = Vector{Union{Missing, Float64}}(missing, 98213)

for i = 1:98213

if !(x.yardline_100[i] == "NA" || x.ep[i] == "NA" || x.down[i] == "NA")

yardline_100[i] = parse(Float64,x.yardline_100[i])

ep[i] = parse(Float64, x.ep[i])

down[i] = parse(Float64, x.down[i])

end

x.year = year.(x.game_date)

res_pos = Vector{Union{Missing, String}}(missing, 98213)

res_ep = Vector{Union{Missing, Float64}}(missing, 98213)

res_game = Vector{Union{Missing, Int64}}(missing, 98213)

res_down = Vector{Union{Missing, Float64}}(missing, 98213)

for i = 2:98213

res_pos[i-1] = x.posteam[i]

res_ep[i-1] = ep[i]

res_game[i-1] = x.game_id[i]

res_down[i-1] = down[i]

end

for i = 1:98213

if !(ismissing(res_pos[i]) || ismissing(x.posteam[i]))

if x.posteam[i] == res_pos[i]

res_ep[i] = res_ep[i]

else

res_ep[i] = -res_ep[i]

end

diff_ep = res_ep - ep

succ = Vector{Union{Missing, Int64}}(missing, 98213)

for i = 1:98213

if !(ismissing(x.game_id[i]) || ismissing(res_game[i]) || ismissing(x.posteam[i]) || ismissing(res_pos[i]) || ismissing(down[i]) || ismissing(res_down[i]))

if !(x.game_id[i] == res_game[i])

diff_ep[i] = missing

end

if res_down[i] == 1 && x.posteam[i] == res_pos[i] && down[i] == 3

succ[i] = 1

elseif res_down[i] == 4 || down[i] == 4 || down[i] == 1

succ[i] = 0

end

pct_field = yardline_100./100

x1 = DataFrame(id = id, yardline_100 = yardline_100, ep = ep, succ = succ, down = down, year = x.year, pct_field = pct_field, ydstogo = x.ydstogo, play_type = x.play_type, diff_ep = diff_ep)

x1 = dropmissing!(x1)

# Note that this is a little different from the R code because it drops missing.

# 8.5.2

lm_run = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[(x1.play_type .== "run").*(x1.down .== 3).*(x1.succ .== 1),1:end])

lm_pass = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[(x1.play_type .== "pass").*(x1.down .== 3).*(x1.succ .== 1),1:end])

lm_punt = lm(@formula(diff_ep ~ ydstogo + pct_field + year), x1[x1.play_type .== "punt",1:end])

X = hcat(ones(53129), x1.ydstogo, x1.pct_field, x1.year)

x1.run_ep = x1.ep + X*coef(lm_run)

x1.pass_ep = x1.ep + X*coef(lm_pass)

x1.punt_ep = x1.ep + X*coef(lm_punt)

rng = MersenneTwister(123456789)

Θ̂ = Array{Union{Missing, Float64}}(missing, 4*30, 4)

global k = 1

for i = 1:4

tg = i # togo distance

for j = 1:30

yl = 29 + j # yardline

index = (x1.down .== 3).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2)).*((x1.play_type .== "pass") + (x1.play_type .== "run"))

global Ep = hcat(x1.run_ep[index .== 1], x1.pass_ep[index .== 1], x1.punt_ep[index .== 1])

global s = x1.succ[index .== 1]

global play = x1.play_type[index .== 1]

init = randn!(rng, zeros(4)).*2

a = optimize(p₃, init, g_tol=1e-15)

Θ̂[k,1:end] = exp.(a.minimizer)./(1 .+ exp.(a.minimizer))

global k += 1

println(k)

end

Θ̂

# 8.5.3

lm_over = lm(@formula(ep ~ pct_field + pct_field^2), x1[x1.down.==1,1:end])

x1.over_ep = -hcat(ones(53129), 1 .- x1.pct_field, (1 .- x1.pct_field).^2)*coef(lm_over)

p₄ = function(θ, Ep)

θ₁ = θ[1]

θ₂ = θ[2]

θ₃ = θ[3]

θ₄ = θ[4]

q₄ = q_fun(θ, Ep)

p_4p = q₄.q_o.*(θ₃.*(1 .- q₄.q_d) + θ₄.*q₄.q_d)

p_4r = (1 .- q₄.q_o).*(θ₁.*(1 .- q₄.q_d) + θ₂.*q₄.q_d)

p_4 = p_4p + p_4r

Epgfi = p_4r.*Ep[1:end,1] + p_4p.*Ep[1:end,2] + (1 .- p_4).*Ep[1:end,3]

return (p_4 = p_4, Epgfi = Epgfi)

end

tab_res = Array{Union{Missing, Float64}}(missing, 4, 30)

prob_actual = Array{Union{Missing, Float64}}(missing, 4, 30)

prob_pred = Array{Union{Missing, Float64}}(missing, 4, 30)

global k = 1

for i = 1:4

tg = i # togo distance

for j = 1:30

yl = 29 + j # yardline

# create subset for analysis

index3 = (x1.down .== 3).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2)).*((x1.play_type .== "pass") + (x1.play_type .== "run"))

y = x1[index3 .== 1,1:end]

# determine predicted success

succ4 = p₄(Θ̂[k,1:end], hcat(y.run_ep, y.pass_ep, y.over_ep))

index4 = (x1.down .== 4).*(x1.yardline_100 .> yl).*(x1.yardline_100 .< (yl + 20)).*(x1.ydstogo .> (tg - 2)).*(x1.ydstogo .< (tg + 2))

z = x1[index4 .== 1,1:end]

go = Vector{Union{Missing, Float64}}(missing, size(z)[1])

for l = 1:size(z)[1]

if z.play_type[l] == "run" || z.play_type[l] == "pass"

go[l] = 1

elseif z.play_type[l] == "punt"

go[l] = 0

end

# relative value of punting

tab_res[i, j] = mean(y.punt_ep - succ4.Epgfi)

# predicted probability of going for it

prob_pred[i, j] = mean(y.punt_ep - succ4.Epgfi .< 0)

# actual probability of going for it

prob_actual[i, j] = mean(skipmissing(go))

global k += 1

println(k)

end

tab_res

tab_res2 = Array{Union{Missing, String, Float64}}(missing,7,5)

tab_res2[1,1:end] = ["" "1 To Go" "2 To Go" "3 To Go" "4 To Go"]

tab_res2[1:end,1] = ["" "Min." "1st Qu." "Median" "3rd Qu" "Mean" "Max."]

for i = 1:4

tab_res2[2,i+1] = minimum(tab_res[i,1:end])

tab_res2[3:5,i+1] = quantile(tab_res[i,1:end], [0.25;0.5;0.75])

tab_res2[6,i+1] = mean(tab_res[i,1:end])

tab_res2[7,i+1] = maximum(tab_res[i,1:end])

end

tab_res2

# 8.5.4

prob_diff = Vector{Union{Missing, Float64}}(missing, 4*30)

global k = 1

for i = 1:4

for j = 1:30

prob_diff[k] = prob_actual[i,j] - prob_pred[i,j]

global k += 1

end

histogram(prob_diff)

vline!(zeros(4*30))

xlabel!("Actual Prob - Pred Prob")