x <- read.csv("wcer.csv", as.is = TRUE) # Sales data
w <- read.csv("upccer.csv", as.is = TRUE) # UPC level data
z <- read.csv("cereal.csv", as.is = TRUE) # Characteristics data
require(foreign)
x2 <- read.dta("demo.dta") # demographic data at store level
x3 <- x2[,colnames(x2) %in% c("store","age9","zone","income","hhlarge"
,"workwom","poverty","unemp")]
# %in% finds the index for terms in the first set that are also in the
# second set.
w$name2 <-
sub('[. | ]*',"",
sub("GM","",
sub("cere.*","",
sub("kellogg","",
sub("kelloggs","",
sub("nab","",
sub("nabisco","",
sub("post","",
sub("[~|'|-| ]","",
tolower(sub(" ","",w$DESCRIP)))))))))))
# sub replaces the first term with the second in a string.
# it accepts real expressions.
# tolower makes the string lower case.
w[w$name2=="ricekrispi",]$name2 <- "ricekrispies"
w[w$name2=="applecinnamoncherr",]$name2 <- "applecinnamoncheer"
z$name2 <- substr(sub("[']","",
sub(" ","",
sub("[^a-z]","",tolower(z$name)))),1,18)
# substr the subset of the string from the first number to the last.
w1 <- merge(w,z,by=c("name2"))
w1$oz <- as.numeric(sub("[a-z].*","",tolower(w1$SIZE)))
w2 <- w1[,colnames(w1) %in% c("UPC","mfr","calories","protein",
"fat","sodium","fiber","carbo",
"vitamins","rating","oz")]
y <- x[x$UPC %in% unique(w1$UPC) & x$WEEK > 380,]
# limit the size of the data set.
ps <- qs <- matrix(NA,length(unique(y$STORE))*length(unique(y$WEEK)),
length(unique(y$UPC))+2)
colnames(ps) <- colnames(qs) <-
c("week","store",as.character(sort(unique(y$UPC))))
a <- 1 # a will be used to put things in the right place.
for (t in 1:length(unique(y$WEEK))) {
week <- sort(unique(y$WEEK))[t]
for (j in 1:length(unique(y$STORE))) {
store <- sort(unique(y$STORE))[j]
qs[a,1:2] <- c(week,store)
if (length((y[y$STORE==store & y$WEEK==week,]$MOVE > 0)) > 0) {
qs[a,colnames(qs) %in%
as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <-
y[y$STORE==store &
y$WEEK==week,]$MOVE/sum(y[y$STORE==store &
y$WEEK==week,]$MOVE,na.rm = TRUE)
}
else {
qs[a,colnames(qs) %in%
as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <- 0
}
ps[a,colnames(ps) %in%
as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <-
y[y$STORE==store &
y$WEEK==week,]$PRICE/y[y$STORE==store & y$WEEK==week,]$QTY
a <- a + 1 # update a
#print(a)
}
}
# Leave out the UPC with largest average share.
upc0 <- as.numeric(names(which.max(colMeans(qs[,3:dim(qs)[2]], na.rm = TRUE))))
col0 <- 2 + which.max(colMeans(qs[,3:dim(qs)[2]], na.rm = TRUE))
w3 <- w2
w3[,-c(1:2)] <- as.matrix(w2[,-c(1:2)]) -
t(matrix(unlist(rep(w2[w2$UPC==upc0,-c(1:2)],dim(w2)[1])),
nrow=(dim(w2)[2]-2)))
eps <- 1e-4 # a small number (eps = epsilon)
sig <- log(qs[,-c(1:2,col0)]+eps) -
matrix(rep(log(qs[,col0]+eps),dim(qs)[2]-3), nrow = dim(qs)[1])
ps2 <- ps
for (i in 1:(dim(ps)[2]-2)) {
upc <- as.numeric(colnames(ps)[2+i])
ps2[,2+i] <- ps[,2+i]/w2[w2$UPC==upc,]$oz
}
ps3 <- ps2[,-c(1:2,col0)] -
matrix(rep(ps2[,col0], dim(qs)[2]-3), nrow = dim(ps2)[1])
ps4 <- ps[,-c(1:2,col0)] -
matrix(rep(ps[,col0], dim(qs)[2]-3), nrow = dim(ps)[1])
upcs <- t(matrix(rep(colnames(qs)[-c(1:2,col0)],dim(qs)[1]),
nrow = dim(qs)[2]-3))
weeks <- matrix(rep(qs[,1],dim(qs)[2]-3), nrow = dim(qs)[1])
stores <- matrix(rep(qs[,2],dim(qs)[2]-3), nrow = dim(qs)[1])
X <- cbind(as.vector(upcs),as.vector(weeks),as.vector(stores),
as.vector(ps3),as.vector(ps4),as.vector(sig))
colnames(X) <- c("UPC","WEEK","store","ozprice","price","sig")
X1 <- merge(X,w3,by=c("UPC"))
X2 <- merge(X1,x3,by=c("store"))
X2$ozprice <- as.numeric(as.character(X2$ozprice))
X2$price <- as.numeric(as.character(X2$price))
X2$sig <- as.numeric(as.character(X2$sig))
X2$store <- as.numeric(as.character(X2$store))
X2$UPC <- as.numeric(as.character(X2$UPC))
X2$WEEK <- as.numeric(as.character(X2$WEEK))
# Note in the merger above, R change the type of the variable to a "factor".
X2$quaker <- ifelse(X2$mfr=="Q",1,0)
X2$post <- ifelse(X2$mfr=="P",1,0)
X2$kellogg <- ifelse(X2$mfr=="K",1,0)
Y3 <- X2$ozprice
X3 <- X2[,colnames(X2) %in% c("sig","fat","carbo","sodium","fiber",
"oz","quaker","post","kellogg","age9",
"hhlarge")]
write.csv(X2,"dominicks.csv")
# Note that the analysis uses total revenue:
sum(x$MOVE*x$PRICE/x$QTY)