# chapter7.jl
# Chapter 7
#import Pkg
#Pkg.add("Plots")
#Pkg.add("Optim")
#Pkg.add("StatsKit")
#Pkg.add("GLM")
#Pkg.add("DataFrames")
#Pkg.add("StatsPlots")
#Pkg.add("StatFiles")
#Pkg.add("NLSolversBase")
#Pkg.add("Distributions")
using Random
using Statistics
using Plots
using Optim
using NLSolversBase
using StatsKit
using CSV
using GLM
using DataFrames
using LinearAlgebra
using StatsPlots
using StatFiles
using Distributions
# 7.1
# 7.2
# 7.2.1
# 7.2.2
# 7.2.3
# 7.3
# 7.3.1
rng = MersenneTwister(123456789)
N = 1000 # number of markets
β = 0.25 # demand parameter
v = 3 .+ randn!(rng, zeros(N)) # quality (vertical) measure
c_L = 3 .+ randn!(rng, zeros(N))
c_R = 9 .+ randn!(rng, zeros(N))
# costs for both firms
p_L = (v .+ 1 .+ β.*c_R + 2*β.*c_L)./(3*β)
p_R = (-v .- 1 .+ β.*c_L + 2*β.*c_R)./(3*β)
# price function for each firm
x_L = (v .+ 1 .- β.*p_L .+ β.*p_R)./2
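# demand (firm L's market share); note the own-price effect is ∂x_L/∂p_L = -β/2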
index = (p_L .> 0).*(p_R .> 0).*(x_L .> 0).*(x_L .< 1)
c_L1 = c_L[index]
c_R1 = c_R[index]
p_L1 = p_L[index]
p_R1 = p_R[index]
x_L1 = x_L[index]
v1 = v[index]
# keep only markets with positive prices and an interior market share
scatter(x_L1, p_L1 - p_R1) # plot the price difference p_L - p_R against firm L's share
# 7.3.3
# Intent-to-treat
data1 = DataFrame(x = x_L1, p = p_L1 - p_R1, c = c_L1 - c_R1)
lm1 = lm(@formula(x ~ c), data1)
# First stage
lm2 = lm(@formula(p ~ c), data1)
# IV
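# The ratio of the reduced-form (intent-to-treat) slope to the first-stage slope
# is the Wald / indirect-least-squares estimate of the effect of the price
# difference on firm L's share.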
coef(lm1)[2]/coef(lm2)[2]
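# A cross-check (the names p_hat and lm_2sls are illustrative): in this
# just-identified case the ratio above equals explicit two-stage least squares,
# regressing the share on the first-stage fitted values of the price difference.
data1.p_hat = predict(lm2)               # first-stage fitted values
lm_2sls = lm(@formula(x ~ p_hat), data1) # second stage
coef(lm_2sls)[2]                         # equals coef(lm1)[2]/coef(lm2)[2]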
# Bootstrap IV estimator: each replication resamples observations with
# replacement and computes (Z'X)^-1 Z'y; with Z = X (the default) this is OLS.
function lm_iv(y, X; Z=X, Reps=100, min1=0.05, max1=0.95)
    # Set up
    Random.seed!(123456789)
    X = [ones(length(y)) X]   # add an intercept column
    Z = [ones(length(y)) Z]
    # Bootstrap
    bs_mat = Array{Union{Missing, Float64}}(missing, Reps, size(X)[2])
    for r = 1:Reps
        index_r = rand(1:length(y), length(y))  # resample rows with replacement
        y_bs = y[index_r]
        X_bs = X[index_r,1:end]
        Z_bs = Z[index_r,1:end]
        bs_mat[r,1:end] = inv(Z_bs'X_bs)*Z_bs'y_bs
        #println(r)
    end
    # Present results: bootstrap mean, standard deviation, and quantiles
    tab_res = Array{Union{Missing,Float64,String}}(missing,1+size(X)[2],5)
    tab_res[1,2:end] = ["Mean" "SD" string(min1) string(max1)]
    tab_res[2,1] = "intercept"
    for i in 1:size(X)[2]
        tab_res[i+1,2] = mean(bs_mat[1:end,i])
        tab_res[i+1,3] = std(bs_mat[1:end,i])
        tab_res[i+1,4:5] = quantile(bs_mat[1:end,i],[min1,max1])
    end
    return tab_res
end
y = x_L1
X = hcat(p_L1, p_R1)
Z = hcat(c_L1, c_R1)
tab_ols = lm_iv(y, X)
tab_iv = lm_iv(y, X, Z=Z)
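# tab_ols regresses firm L's share on the two prices by OLS; because both prices
# depend on the unobserved quality v, which also enters demand, those estimates
# are biased. tab_iv instead uses the cost shifters c_L and c_R, which move
# prices but enter demand only through them.
# A quick check of the OLS column against GLM (pL and pR are illustrative names):
lm_check = lm(@formula(x ~ pL + pR), DataFrame(x = y, pL = p_L1, pR = p_R1))
# Its coefficients should be close to the bootstrap means reported in tab_ols.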
# 7.3.4
# Regress firm L's price on quality v; from the pricing equation the slope is 1/(3β)
lm1 = lm(@formula(p ~ v), DataFrame(p=p_L1, v=v1))
b̂ = coef(lm1)[2]
β̂ = 1/(3*b̂)
-β̂/2 # implied own-price effect on demand, ∂x_L/∂p_L
# 7.4
# 7.4.1
# 7.4.2
# 7.4.3
# 7.4.4
# 7.4.5
# 7.4.6
# 7.5
# 7.5.1
path = "/Volumes/My WD/Book" # set to the folder containing dominicks.csv
x = DataFrame(CSV.File(string(path,"/dominicks.csv")))
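# Several columns in the extract are stored as strings with "NA" marking missing
# values; parse them to Float64 below, leaving missing where the data are unavailable.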
n_obs = size(x, 1) # 88,160 rows in the Dominick's extract
p = Vector{Union{Missing, Float64}}(missing, n_obs)
sig = Vector{Union{Missing, Float64}}(missing, n_obs)
oz = Vector{Union{Missing, Float64}}(missing, n_obs)
age9 = Vector{Union{Missing, Float64}}(missing, n_obs)
hhlarge = Vector{Union{Missing, Float64}}(missing, n_obs)
income = Vector{Union{Missing, Float64}}(missing, n_obs)
share = Vector{Union{Missing, Float64}}(missing, n_obs)
for i = 1:n_obs
    if x.ozprice[i] == "NA" || x.oz[i] == "NA" || x.sig[i] == "NA" || x.age9[i] == "NA" || x.hhlarge[i] == "NA"
        p[i] = missing
    else
        p[i] = parse(Float64, x.ozprice[i])
        oz[i] = parse(Float64, x.oz[i])/100
        sig[i] = parse(Float64, x.sig[i])
        age9[i] = parse(Float64, x.age9[i])
        hhlarge[i] = parse(Float64, x.hhlarge[i])
        income[i] = parse(Float64, x.income[i])
        share[i] = parse(Float64, x.share[i])
    end
    #println(i)
end
fat = x.fat./100
sodium = x.sodium./1000
carbo = x.carbo./100
x1 = DataFrame(sig=sig, income=income, p=p, fat=fat, carbo=carbo, sodium=sodium, fiber=x.fiber, oz=oz, quaker=x.quaker, post=x.post, kellogg=x.kellogg, age9=age9, hhlarge=hhlarge, store=x.store, WEEK=x.WEEK, UPC=x.UPC, share=share)
x1 = dropmissing!(x1)
# note that this is different from the R code.
W = hcat(x1.sig, x1.fat, x1.sodium, x1.fiber, x1.carbo, x1.oz, x1.quaker, x1.post, x1.kellogg)
# 7.5.2
p = x1.p
Z = hcat(x1.income, x1.fat, x1.sodium, x1.fiber, x1.carbo, x1.oz, x1.quaker, x1.post, x1.kellogg)
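# Z replaces sig (the first column of W) with store-level income, which serves
# as the instrument; the remaining characteristics instrument for themselves.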
tab_ols = lm_iv(p, W, Reps=300)
tab_iv = lm_iv(p, W, Z=Z, Reps=300)
# Note that these results are different from the R code
# In particular there seems to be a multicollinearity problem
# with the store level measures - income, age9 and hhlarge.
# age9 and hhlarge are not used here.
# 7.5.3
β̂ = -1/(2*tab_iv[3,2]) # demand parameter implied by the IV coefficient on sig (row 3, bootstrap mean)
γ̂ = -tab_iv[2:end,2]./β̂ # rescale the hedonic coefficients by -1/β̂
γ̂[2] = β̂ # put β̂ in the sig slot, so the index is [1 sig characteristics]⋅γ̂
stores = unique(x1.store)
weeks = unique(x1.WEEK)
UPCs = unique(x1.UPC)
T_s = length(stores)
T_w = length(weeks)
J = length(UPCs)
exp_δ = Array{Union{Missing, Float64}}(missing, T_s*T_w, J)
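# Fill exp_δ with the exponentiated index exp([1 sig characteristics]⋅γ̂) for
# each store-week (row t) and each UPC (column) observed in that market.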
global t = 1
for t_s = 1:T_s
    store = stores[t_s]
    for t_w = 1:T_w
        week = weeks[t_w]
        upc_temp = x1.UPC[(x1.WEEK .== week).*(x1.store .== store)]
        index_upc = []
        for j = 1:J
            if UPCs[j] ∈ upc_temp
                index_upc = [index_upc; j]
            end
        end
        W_temp = W[(x1.WEEK .== week).*(x1.store .== store),1:end]
        exp_δ[t,index_upc] = exp.(hcat(ones(size(W_temp)[1]), W_temp)*γ̂)
        global t += 1
        println(t)
    end
end
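# Convert the exponentiated indices into within-market logit shares, dividing
# each product's exp(δ) by the sum over products observed in that store-week.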
share_est = Array{Union{Missing,Float64}}(missing, T_s*T_w, J)
for i = 1:(T_s*T_w)
    sum_exp_δ = sum(skipmissing(exp_δ[i,1:end]))
    for j = 1:J
        if !ismissing(exp_δ[i,j])
            share_est[i,j] = exp_δ[i,j]/sum_exp_δ
        end
    end
end
share_est_ave = Vector{Union{Missing, Float64}}(missing, J)
for j = 1:J
    share_est_ave[j] = mean(skipmissing(share_est[1:end,j]))
end
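# Average estimated share of each UPC across store-weeks.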
share_est_ave
histogram(share_est_ave)
upc_acc = 1600062760
# Family size Apple cinnamon cheerios
share_acc = mean(x1.share[x1.UPC .== upc_acc])
j_acc = UPCs .== upc_acc
K = 20
min_k = -7
max_k = -5
global diff_k = (max_k - min_k)/K
acc_demand = Array{Union{Missing, Float64}}(missing, K, 2)
global min_t = min_k
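# Trace out a demand curve for Apple Cinnamon Cheerios: for each of K values of
# pr between min_k and max_k (substituted into the sig slot), recompute its
# exponentiated index and the implied average share with an outside good.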
for k = 1:K
    pr = min_t + diff_k
    exp_δ2 = copy(exp_δ) # copy so the original exp_δ is not overwritten
    index_acc = x1.UPC .== upc_acc
    N = sum(index_acc)
    W_temp = hcat(ones(N), fill(pr, N), W[x1.UPC .== upc_acc,2:end])
    exp_δ2[1:end,j_acc] = exp.(W_temp*γ̂)
    ave_share = Array{Union{Missing, Float64}}(missing, length(UPCs), 2)
    for i = 1:length(UPCs)
        UPC = UPCs[i]
        ave_share[i,1] = UPC
        ave_share[i,2] = mean(skipmissing(exp_δ2[1:end,i]))
        println(i)
    end
    ave_share[1:end,2] = ave_share[1:end,2]./(1 + sum(skipmissing(ave_share[1:end,2])))
    acc_demand[k,1] = pr
    acc_demand[k,2] = ave_share[j_acc,2][1]
    global min_t = min_t + diff_k
    println(k)
end
scatter(acc_demand[1:end,2],acc_demand[1:end,1]) # estimated demand: share on the horizontal axis, pr on the vertical
hline!(fill(-6.1,K)) # horizontal reference line at pr = -6.1
# 7.5.4
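# A triangle calculation: half the gap between -5.2 and the price at k = 9
# (which is -6.1, the reference line above), times the estimated share at that
# price, scaled by the two remaining constants.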
(0.5*(-5.2-acc_demand[9,1]))*acc_demand[9,2]*267888011*(52/367)
# Note that this is different from results with the R code.
# The data sets are somewhat different and the estimation procedure also differs.