# appendixA.jl
# Chapter Appendix A
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# A.1
# A.2
# A.2.1
# A.2.2
# Draw a small sample from N(μ, σ²) and compare its kernel density
# estimate with that of a much larger sample.
rng = MersenneTwister(123456789)
N = 10
μ = -3
σ = 5
# scale-and-shift standard normal draws: x ~ N(μ, σ²)
x = μ .+ σ .* randn(rng, N)
dens_x = kde(x)
mean(x)   # sample mean (noisy with only N = 10 draws)
std(x)    # sample standard deviation
plot(dens_x)
# overlay the density estimate from a 10,000-draw sample
plot!(kde(μ .+ σ .* randn(rng, 10000)))
# A.2.3
# Monte Carlo: M = 500 replications of the N = 10 experiment,
# stored one sample per row.
rng = MersenneTwister(123456789)
M = 500
sample_est = zeros(M, N)
for m in 1:M
    sample_est[m, :] = μ .+ σ .* randn(rng, N)
end
# sampling distribution of the mean estimator
means = [mean(sample_est[m, :]) for m in 1:M]
histogram(means)
vline!(fill(-3, M))   # true mean μ = -3
# A.2.4
# Law of large numbers: with N = 100,000 draws the sample mean is
# very close to the true μ.
rng = MersenneTwister(123456789)
N = 100000
x = μ .+ σ .* randn(rng, N)
mean(x)
# A.2.5
# Central limit theorem: standardized sample means for increasing
# sample sizes, overlaid as kernel densities.
rng = MersenneTwister(123456789)
M = 1000
sample_means = zeros(M, 4)
for m in 1:M
    # one column per sample size, drawn in increasing-size order
    for (j, n) in enumerate((10, 100, 1000, 10000))
        sample_means[m, j] = mean(μ .+ σ .* randn(rng, n))
    end
end
plot(kde((sample_means[:, 1] .- μ) .* (√9/σ)))
plot!(kde((sample_means[:, 2] .- μ) .* (√99/σ)))
plot!(kde((sample_means[:, 3] .- μ) .* (√999/σ)))
plot!(kde((sample_means[:, 4] .- μ) .* (√9999/σ)))
# Note here I have adjusted for degrees of freedom.
# A.2.6
# Simulate the sampling distribution implied by each of five small
# samples: mean estimate in column 1, standard error in column 2.
rng = MersenneTwister(123456789)
M = 5
N = 10
sample_means = zeros(M, 2)
for m in 1:M
    x₁ = μ .+ σ .* randn(rng, N)
    # column 1: sample mean; column 2: d.o.f.-adjusted standard error
    sample_means[m, :] = [mean(x₁); std(x₁)/(√(N - 1))]
end
# Each implied distribution is N(mean, se²): center at the estimate and
# scale the draws by the standard error. The original computed
# (randn .+ mean) .* se, which also multiplies the center by se,
# shifting every density to the wrong location; fixed here.
plot(kde(sample_means[1, 1] .+ sample_means[1, 2] .* randn(rng, 1000)))
for m in 2:M
    plot!(kde(sample_means[m, 1] .+ sample_means[m, 2] .* randn(rng, 1000)))
end
# A.2.8
# Bootstrap: resample x₁ with replacement M times and record the mean.
rng = MersenneTwister(123456789)
Random.seed!(123456789)   # rand(x₁, N) below draws from the global RNG
N = 10
x₁ = μ .+ σ .* randn(rng, N)
M = 1000
bs_mean = [mean(rand(x₁, N)) for _ in 1:M]
plot(kde(bs_mean))
# overlay the normal approximation of the mean's sampling distribution
plot!(kde(μ .+ (σ/√(N - 1)) .* randn(rng, 100000)))
# A.2.9
# Simulated one-sided test of the mean at level α.
rng = MersenneTwister(123456789)
α = 0.05
# critical value: α-quantile of a simulated null distribution,
# scaled by the standard error
cr = std(x) .* quantile(randn(rng, 10000), α) ./ √9
# Again adjusting for degrees of freedom.
plot(kde(randn(rng, 10000) .* (std(x)/√9)))
vline!(fill(cr, 10))        # critical value
vline!(fill(mean(x), 10))   # observed mean
# simulated p-value for the standardized observed mean
mean(randn(rng, 10000) .< √9 * mean(x)/std(x))
# A.3
# A.3.1
# A.3.2
# A.3.3
# Log-density of the standard normal distribution, elementwise in z.
# The original returned -z² - √π, which (a) drops the 1/2 factor on z²
# (making the likelihood correspond to variance 1/2, not 1) and
# (b) subtracts √π ≈ 1.77 instead of log(√(2π)) ≈ 0.92. Corrected to
# the standard normal log-pdf: -z²/2 - log(2π)/2.
logϕ = function(z)
    -z.^2 ./ 2 .- log(2*π)/2
end
# Log-likelihood of data x under N(μ, σ²): standardize, sum the
# standard-normal log-densities, and subtract log(σ) per observation
# (the change-of-variables Jacobian).
log_norm = function(x, μ, σ)
    z = (x .- μ)./σ
    sum(logϕ.(z) .- log(σ))
end
# Discrete Bayesian inference for the mean on a grid Θ with an
# (unnormalized) increasing prior g — normalization happens in γ.
Θ = ((1:100).*15)./100 .- 10   # candidate means on [-9.85, 5]
g = (1:100)./100               # prior weights
# Log-likelihood of x₁ at each grid point. A typed comprehension
# replaces the original `h = [h; …]` top-level loop, which errors under
# file-mode soft scope (the rebinding becomes a new local each pass).
h = [log_norm(x₁, θ, 5) for θ in Θ]
# posterior ∝ likelihood × prior (computed in logs, then normalized)
γ = exp.(h .+ log.(g))./sum(exp.(h .+ log.(g)))
plot(Θ, γ)
vline!(fill(-3, 100))         # true mean
vline!(fill(mean(x₁), 100))   # sample mean
# A.4
# A.4.1
# A.4.2
# A.4.3
# Likelihood of every one of the 500 samples at every point of a
# 1000-point grid spanning the observed range of the draws.
lo, hi = extrema(sample_est)
Θ = (hi - lo) .* ((1:1000) ./ 1000) .+ lo
H = zeros(500, 1000)
for i in 1:500
    row = sample_est[i, :]
    for j in 1:1000
        H[i, j] = log_norm(row, Θ[j], 5)
    end
end
g₀ = fill(1/1000, 1000)                  # uniform prior on the grid
log_G0 = fill(log(1/1000), 500, 1000)    # same prior, one row per sample
# per-sample posterior weights, normalized row by row
γ₀ = exp.(H + log_G0)
for i in 1:500
    γ₀[i, :] = γ₀[i, :] ./ sum(γ₀[i, :])
end
# One EM-style prior update: each grid point's new weight is its total
# posterior weight across the 500 samples. Typed comprehensions replace
# the original untyped `[ ]` accumulation loops, which error under
# file-mode soft scope (the rebinding becomes a new local each pass).
g₁ = [sum(γ₀[:, i]) for i in 1:1000]
g₁ = g₁ ./ sum(g₁)
# 1000×500 matrix whose columns all equal the updated log prior
# (replaces the incremental hcat loop, which has the same soft-scope
# problem; transposed below to match H)
log_G1 = repeat(log.(g₁), 1, 500)
γ₁ = exp.(H + log_G1')
for i in 1:500
    γ₁[i, :] = γ₁[i, :] ./ sum(γ₁[i, :])
end
# second update of the prior
g₂ = [sum(γ₁[:, i]) for i in 1:1000]
g₂ = g₂ ./ sum(g₂)
plot(Θ, g₀)    # uniform starting prior
plot!(Θ, g₁)   # after one update
plot!(Θ, g₂)   # after two updates
# Iterate the prior update until successive priors differ (in L1 norm)
# by less than ϵ — an empirical-Bayes fixed-point iteration.
ϵ = 1e-3 # small number
g_old = g₁
g_new = g₂
diff = sum(abs.(g_old - g_new))
while diff > ϵ
    # `global` is required: under file-mode soft scope these
    # assignments would otherwise create loop-locals, the condition
    # would keep reading the stale globals, and the loop would never
    # update or terminate correctly.
    global g_old, g_new, diff
    g_old = g_new
    # 1000×500 replicated log prior (transposed to match H below)
    logprior = repeat(log.(g_old), 1, 500)
    post = exp.(H + logprior')
    for i in 1:500
        post[i, :] = post[i, :] ./ sum(post[i, :])
    end
    g_new = [sum(post[:, i]) for i in 1:1000]
    g_new = g_new ./ sum(g_new)
    diff = sum(abs.(g_old - g_new))
    println(diff)
end
# Per-sample mean estimates. The original used mean(sample_est[i]) —
# linear indexing, which returns a single element — instead of the
# row mean; fixed to sample_est[i, :]. The comprehension also avoids
# the file-mode soft-scope failure of growing a global in a loop.
μ_est = [mean(sample_est[i, :]) for i in 1:500]
# Discrete pseudo-sample from the converged prior g_new for plotting:
# Θ[i] is replicated in proportion to its prior weight.
J = 100000
g_dist = Float64[]
for i in 1:1000
    copies = Int(round(J * g_new[i]))
    if copies > 0
        append!(g_dist, fill(Θ[i], copies))
    end
end
plot(density(g_dist))
vline!(fill(-3, 100))   # true mean
plot!(density!(μ_est))
# A.4.4
# Posterior for the original x₁ sample under the estimated prior g_new.
# (Comprehension avoids the file-mode soft-scope failure of the
# untyped `[ ]` accumulation loop.)
h_x1 = [log_norm(x₁, Θ[i], 5) for i in 1:1000]
# Posterior ∝ likelihood × prior. The original ADDED the prior to the
# likelihood, exp.(h_x1) + g_new, which is not Bayes' rule; corrected
# to multiply — i.e. add the log prior before exponentiating.
γ_x1 = exp.(h_x1 .+ log.(g_new)) ./ sum(exp.(h_x1 .+ log.(g_new)))
# discrete pseudo-sample from the posterior for plotting
J = 100000
g_dist_x1 = Float64[]
for i in 1:1000
    append!(g_dist_x1, fill(Θ[i], Int(round(J * γ_x1[i]))))
end
plot(density(g_dist_x1))
vline!(fill(-3, 100))         # true mean
vline!(fill(mean(x₁), 100))   # sample mean of x₁
# A.5
# A.5.1
# A.5.2
# Log-probability of a batting average p̂ over `trials` at-bats when the
# true hit probability is p: the binomial log-pmf at K = round(p̂·N) hits.
log_bin = function(trials, p̂, p)
    hits = Int(round(p̂*trials))
    misses = trials - hits
    # BigInt binomial coefficient avoids overflow for large at-bat counts
    return log(binomial(big(trials), big(hits))) + hits*log(p) + misses*log(1 - p)
end
# Grid of candidate batting averages and a uniform log prior.
Θ = (1:999)./1000
log_g = fill(log(1/1000), 999)
# Log-likelihood of going 3-for-3 (N = 3, p̂ = 1) at each grid point.
# (Comprehension replaces the untyped `[ ]` accumulation loop, which
# errors under file-mode soft scope.)
h_jp = [log_bin(3, 1, Θ[i]) for i in 1:999]
# posterior over the true average under the uniform prior
γ_jp = exp.(h_jp + log_g)./sum(exp.(h_jp + log_g))
# discrete pseudo-sample, in grid-index units (index ≈ 1000 × average)
J = 100000
γ_dist_jp = Int[]
for i in 1:999
    append!(γ_dist_jp, fill(i, Int(round(J * γ_jp[i]))))
end
# mean
γ_jp'*Θ
plot(density(γ_dist_jp))
vline!(fill(342, 100))   # 0.342 on the index scale
xlabel!("batting average")
# posterior probability the true average is at least 0.342
sum(γ_jp[342:999])
# A.5.3
# Batting records for M players.
M = 1000
path = "ENTER PATH HERE"
# CSV.read requires an explicit sink argument (e.g. DataFrame) since
# CSV.jl 0.7; the original sink-less form was removed from the API.
x = CSV.read(joinpath(path, "bat_ave.csv"), DataFrame)
# data set created from Lahman's data
# H[m, i] = log-likelihood of player m's at-bats/average at Θ[i]
# (assumes columns AB and AVE — TODO confirm against the CSV header)
H = zeros(M, 999)
for m in 1:M
    for i in 1:999
        H[m, i] = log_bin(x.AB[m], x.AVE[m], Θ[i])
    end
    #println(m)
end
# Uniform prior replicated across players, then per-player posterior
# normalization.
log_G = fill(log(1/1000), M, 999)
γ₀ = exp.(H + log_G)
for m in 1:M
    γ₀[m, :] = γ₀[m, :] ./ sum(γ₀[m, :])
end
# First updated prior: total posterior weight at each grid point.
# (Typed comprehension replaces the untyped `[ ]` accumulation loop,
# which errors under file-mode soft scope.)
g₁ = [sum(γ₀[:, i]) for i in 1:999]
g₁ = g₁/sum(g₁)
# Iterate the empirical-Bayes prior update on the MLB data until the
# prior stops changing (L1 difference below ϵ).
ϵ = 0.001
g_old = exp.(log_g)
g_new = g₁
diff = sum(abs.(g_new - g_old))
while diff > ϵ
    # `global` is required: under file-mode soft scope these
    # assignments would otherwise create loop-locals and the loop
    # would never update the script-level variables or terminate.
    global g_old, g_new, diff
    g_old = g_new
    # 999×M replicated log prior (transposed to match H below);
    # replaces the incremental hcat loop, which rebound the existing
    # global log_G from soft scope
    logprior = repeat(log.(g_old), 1, M)
    Γ = exp.(H + logprior')
    for m in 1:M
        Γ[m, :] = Γ[m, :] ./ sum(Γ[m, :])
    end
    g_new = [sum(Γ[:, i]) for i in 1:999]
    g_new = g_new ./ sum(g_new)
    diff = sum(abs.(g_old - g_new))
    println(diff)
end
# A.5.4
# JP's posterior under the estimated MLB-wide prior g_new:
# likelihood × prior in logs, then normalized.
γ_jp2 = exp.(h_jp + log.(g_new))
γ_jp2 = γ_jp2./sum(γ_jp2)
# Discrete pseudo-samples for density plots. append! into typed vectors
# replaces the untyped `[ ]` accumulation loops, which error under
# file-mode soft scope.
J = 100000
g_dist_mlb = Float64[]
for i in 1:999
    append!(g_dist_mlb, fill(Θ[i], Int(round(J * g_new[i]))))
end
γ_dist_jp2 = Float64[]
for i in 1:999
    append!(γ_dist_jp2, fill(Θ[i], Int(round(J * γ_jp2[i]))))
end
# mean
γ_jp2'*Θ
# posterior probability of a true average of at least 0.342
sum(γ_jp2[342:999])
plot(density(g_dist_mlb), label="Prior")
plot!(density!(γ_dist_jp2), label="JPs Posterior")
vline!(fill(0.342, 100))
xlabel!("batting average")
# A.6
# A.6.1
# A.6.2
# A.6.3
# A.6.4
# A.7