# Contact me via the buttons in the footer!
# Source: https://github.com/zetvzb/SABR/blob/master/Webscrape%20attendance%20from%20Baseball%20Reference
#Creating a model for analyzing draft
library(dplyr)
library(sqldf)
library(rvest)
# Scrape the per-game schedule/results table for one team (KCR) from
# Baseball Reference, one season per web page, and stack all seasons into
# `sci_df` (20 columns, newest season first).
#
# `pages` is the OLDEST season to fetch; seasons are requested from 2018 down
# to `pages`, i.e. 2018, 2017, ..., 2000.
pages <- 2000

# To cover every MLB team, this could be wrapped in another loop over team
# acronyms. Here the acronym (e.g. "KCR") is inserted into the URL by hand,
# one team at a time, so each team's result can be checked.

# CSS selector for each column of the schedule table, in output order.
# The names are placeholders only; the final column names are assigned after
# all seasons have been combined.
column_selectors <- c(
  Gm = "th.right",
  Date = "th.right+ .left",
  Boxscore = ".left:nth-child(3)",
  Tm = ".left:nth-child(4)",
  Locator = "#team_schedule td:nth-child(5)",
  Opp = ".left:nth-child(6)",
  W_L = ".left:nth-child(7)",
  R = ".right:nth-child(8)",
  RA = ".right:nth-child(9)",
  Inn = ".right:nth-child(10)",
  WL_Record = ".right:nth-child(11)",
  Rank = ".right:nth-child(12)",
  GB = ".right:nth-child(13)",
  Win = ".left:nth-child(14)",
  Loss = ".left:nth-child(15)",
  Save = ".left:nth-child(16)",
  Time = ".right:nth-child(17)",
  D_N = ".right:nth-child(18)",
  Attendance = ".right:nth-child(19)",
  Streak = ".left:nth-child(20)"
)

# Download one season's schedule page and return its table as a data frame
# with one character column per entry of `column_selectors`.
# `page` is the season (year) embedded in the URL.
scrape_season <- function(page) {
  print(paste0("getting data for page: ", page))
  URL <- paste0("https://www.baseball-reference.com/teams/KCR/", page, "-schedule-scores.shtml")
  webpage <- read_html(URL)
  columns <- lapply(column_selectors, function(selector) {
    webpage %>%
      html_nodes(selector) %>%
      html_text()
  })
  # data.frame construction fails loudly if the selectors matched differing
  # numbers of cells, which signals that the page layout has changed.
  as.data.frame(columns, stringsAsFactors = FALSE)
}

# Collect every season first and bind once. The previous version grew the
# data frame with rbind() inside the loop and guarded the first iteration
# with `pages == 2018`, a test on the constant `pages` that could never be
# TRUE; neither is needed when binding a list in one step.
season_tables <- lapply(2018:pages, scrape_season)
sci_df <- do.call(rbind, season_tables)
names(sci_df) <- c("Game", "Date", "Boxscore", "Team", "Locator", "Opp", "W_L", "R", "RA", "Inn", "WL Record", "Rank", "GB", "Win", "Loss", "Save", "Time", "D_N", "Attendance", "Streak")

# Add a Year field. The scraped rows are stacked newest season first, and each
# season is assumed to occupy exactly 162 consecutive rows: rows 1-162 are
# 2018, rows 163-324 are 2017, ..., rows 2917-3078 are 2000.
# (The previous hand-written ranges started 2014 at row 659 instead of 649,
# which left rows 649-658 with an empty Year.)
seasons <- as.character(2018:2000)
games_per_season <- 162
expected_rows <- length(seasons) * games_per_season

# The fixed 162-row layout is an assumption about the scraped data; flag it
# when the row count disagrees so mislabelled seasons do not go unnoticed.
if (nrow(sci_df) != expected_rows) {
  warning(
    "sci_df has ", nrow(sci_df), " rows but the Year labelling assumes ",
    expected_rows, " (", length(seasons), " seasons x ", games_per_season,
    " rows); Year values may be misaligned.",
    call. = FALSE
  )
}

sci_df$Year <- ""
sci_df[seq_len(expected_rows), "Year"] <- rep(seasons, each = games_per_season)

# Small view (game number, date, year) for eyeballing the Year assignment.
check <- sci_df[, c("Game", "Date", "Year")]

write.csv(sci_df, "Royals.csv")
# Merge the per-team CSV files in the working directory into one data frame
# and write the combined result to "Scrubbed_Attendance.csv".
#
# Only files ending in .csv are read, and the combined output file itself is
# skipped, so the R script (or any other non-CSV file) in the folder and a
# previous run's output no longer break or duplicate the merge.
output_file <- "Scrubbed_Attendance.csv"
filenames <- list.files(pattern = "\\.csv$", full.names = TRUE, ignore.case = TRUE)
filenames <- filenames[basename(filenames) != output_file]

Merge <- lapply(filenames, function(i) {
  read.csv(i, header = TRUE)
})

# All team files must share the same columns for the row-bind to succeed.
df <- do.call(rbind.data.frame, Merge)
write.csv(df, output_file)