dominicks.R

x <- read.csv("wcer.csv", as.is = TRUE) # Sales data

w <- read.csv("upccer.csv", as.is = TRUE) # UPC level data

z <- read.csv("cereal.csv", as.is = TRUE) # Characteristics data

require(foreign)

x2 <- read.dta("demo.dta") # demographic data at store level

x3 <- x2[,colnames(x2) %in% c("store","age9","zone","income","hhlarge"

,"workwom","poverty","unemp")]

# %in% finds the index for terms in the first set that are also in the

# second set.

w$name2 <-

sub('[. | ]*',"",

sub("GM","",

sub("cere.*","",

sub("kellogg","",

sub("kelloggs","",

sub("nab","",

sub("nabisco","",

sub("post","",

sub("[~|'|-| ]","",

tolower(sub(" ","",w$DESCRIP)))))))))))

# sub replaces the first term with the second in a string.

# it accepts real expressions.

# tolower makes the string lower case.

w[w$name2=="ricekrispi",]$name2 <- "ricekrispies"

w[w$name2=="applecinnamoncherr",]$name2 <- "applecinnamoncheer"

z$name2 <- substr(sub("[']","",

sub(" ","",

sub("[^a-z]","",tolower(z$name)))),1,18)

# substr the subset of the string from the first number to the last.

w1 <- merge(w,z,by=c("name2"))

w1$oz <- as.numeric(sub("[a-z].*","",tolower(w1$SIZE)))

w2 <- w1[,colnames(w1) %in% c("UPC","mfr","calories","protein",

"fat","sodium","fiber","carbo",

"vitamins","rating","oz")]

y <- x[x$UPC %in% unique(w1$UPC) & x$WEEK > 380,]

# limit the size of the data set.

ps <- qs <- matrix(NA,length(unique(y$STORE))*length(unique(y$WEEK)),

length(unique(y$UPC))+2)

colnames(ps) <- colnames(qs) <-

c("week","store",as.character(sort(unique(y$UPC))))

a <- 1 # a will be used to put things in the right place.

for (t in 1:length(unique(y$WEEK))) {

week <- sort(unique(y$WEEK))[t]

for (j in 1:length(unique(y$STORE))) {

store <- sort(unique(y$STORE))[j]

qs[a,1:2] <- c(week,store)

if (length((y[y$STORE==store & y$WEEK==week,]$MOVE > 0)) > 0) {

qs[a,colnames(qs) %in%

as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <-

y[y$STORE==store &

y$WEEK==week,]$MOVE/sum(y[y$STORE==store &

y$WEEK==week,]$MOVE,na.rm = TRUE)

}

else {

qs[a,colnames(qs) %in%

as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <- 0

}

ps[a,colnames(ps) %in%

as.character(y[y$STORE==store & y$WEEK==week,]$UPC)] <-

y[y$STORE==store &

y$WEEK==week,]$PRICE/y[y$STORE==store & y$WEEK==week,]$QTY

a <- a + 1 # update a

#print(a)

}

# Leave out the UPC with largest average share.

upc0 <- as.numeric(names(which.max(colMeans(qs[,3:dim(qs)[2]], na.rm = TRUE))))

col0 <- 2 + which.max(colMeans(qs[,3:dim(qs)[2]], na.rm = TRUE))

w3 <- w2

w3[,-c(1:2)] <- as.matrix(w2[,-c(1:2)]) -

t(matrix(unlist(rep(w2[w2$UPC==upc0,-c(1:2)],dim(w2)[1])),

nrow=(dim(w2)[2]-2)))

eps <- 1e-4 # a small number (eps = epsilon)

sig <- log(qs[,-c(1:2,col0)]+eps) -

matrix(rep(log(qs[,col0]+eps),dim(qs)[2]-3), nrow = dim(qs)[1])

ps2 <- ps

for (i in 1:(dim(ps)[2]-2)) {

upc <- as.numeric(colnames(ps)[2+i])

ps2[,2+i] <- ps[,2+i]/w2[w2$UPC==upc,]$oz

}

ps3 <- ps2[,-c(1:2,col0)] -

matrix(rep(ps2[,col0], dim(qs)[2]-3), nrow = dim(ps2)[1])

ps4 <- ps[,-c(1:2,col0)] -

matrix(rep(ps[,col0], dim(qs)[2]-3), nrow = dim(ps)[1])

upcs <- t(matrix(rep(colnames(qs)[-c(1:2,col0)],dim(qs)[1]),

nrow = dim(qs)[2]-3))

weeks <- matrix(rep(qs[,1],dim(qs)[2]-3), nrow = dim(qs)[1])

stores <- matrix(rep(qs[,2],dim(qs)[2]-3), nrow = dim(qs)[1])

X <- cbind(as.vector(upcs),as.vector(weeks),as.vector(stores),

as.vector(ps3),as.vector(ps4),as.vector(sig))

colnames(X) <- c("UPC","WEEK","store","ozprice","price","sig")

X1 <- merge(X,w3,by=c("UPC"))

X2 <- merge(X1,x3,by=c("store"))

X2$ozprice <- as.numeric(as.character(X2$ozprice))

X2$price <- as.numeric(as.character(X2$price))

X2$sig <- as.numeric(as.character(X2$sig))

X2$store <- as.numeric(as.character(X2$store))

X2$UPC <- as.numeric(as.character(X2$UPC))

X2$WEEK <- as.numeric(as.character(X2$WEEK))

# Note in the merger above, R change the type of the variable to a "factor".

X2$quaker <- ifelse(X2$mfr=="Q",1,0)

X2$post <- ifelse(X2$mfr=="P",1,0)

X2$kellogg <- ifelse(X2$mfr=="K",1,0)

Y3 <- X2$ozprice

X3 <- X2[,colnames(X2) %in% c("sig","fat","carbo","sodium","fiber",

"oz","quaker","post","kellogg","age9",

"hhlarge")]

write.csv(X2,"dominicks.csv")

# Note that the analysis uses total revenue:

sum(x$MOVE*x$PRICE/x$QTY)

Google Sites

Report abuse