Given the size of the data, I had to use RStudio for the cleaning.
##Libraries needed
library(tidyverse)
library(lubridate)
library(data.table)
library(janitor)
##I imported all csv files individually then combined them all into one data.frame while deleting separated tables to save room. I have realized since then that there
is an easier way to do this.
novdec <- merge(x=`202111`,y=`202112`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202111`, `202112`)
janfeb <- merge(x=`202201`,y=`202202`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202201`, `202202`)
marapr <- merge(x=`202203`,y=`202204`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202203`, `202204`)
mayjun <- merge(x=`202205`,y=`202206`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202205`, `202206`)
julaug <- merge(x=`202207`,y=`202208`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202207`, `202208`)
sepoct <- merge(x=`202209`,y=`202210`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`202209`, `202210`)
janapr <- merge(x=`janfeb`,y=`marapr`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`janfeb`, `marapr`)
janjun <- merge(x=`janapr`,y=`mayjun`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`janapr`, `mayjun`)
janaug <- merge(x=`janjun`,y=`julaug`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`janjun`, `julaug`)
jandec <- merge(x=`janaug`,y=`novdec`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`janaug`, `novdec`)
ride_data <- merge(x=`jandec`,y=`sepoct`,by=c("ride_id", "rideable_type","started_at", "ended_at", "start_station_name","start_station_id", "end_station_name", "end_station_id", "start_lat", "start_lng", "end_lat","end_lng", "member_casual"), all=TRUE)
rm(`jandec`, `sepoct`)
##I then deleted any entry with incomplete data. This only did some for some reason.
ride_data_omit <- ride_data %>% drop_na()
##deleted ride_data original to make room and for it to come back later on
rm(ride_data)
##Looked at data properties
tibble(ride_data_omit)
##Changed start and end times to dates rather than chr
ride_data_omit$started_at <- ymd_hms(ride_data_omit$started_at)
ride_data_omit$ended_at <- ymd_hms(ride_data_omit$ended_at)
##Add ride_length column
ride_data_omit$ride_length <- difftime(ride_data_omit$ended_at, ride_data_omit$started_at, units = "secs")
##deleted any data with a trip in the negative numbers
ride_data_clean <- ride_data_omit %>%
filter(!(ride_length < 0))
##make some room before my computer has a heart attack
rm(ride_data_omit)
##The drop_na function didn't seem to get rid of most of the incomplete data, so I tried something else
ride_data <- ride_data_clean %>%
filter(!(is.na(start_station_name) | start_station_name == ""))
rm(ride_data_clean)
ride_data_clean <- ride_data%>%
filter(!(is.na(end_station_name) | end_station_name == ""))
rm(ride_data)
##Add day of week column
ride_data_clean$day_of_week <- wday(ride_data_clean$started_at)
##I was having trouble with mapping via long/lat on tableau so I shorten the numbers here.
ride_data_clean$start_lat<- format(round(ride_data_clean$start_lat, 3), nsmall = 3)
ride_data_clean$start_lng<- format(round(ride_data_clean$start_lng, 3), nsmall = 3)
ride_data_clean$end_lat<- format(round(ride_data_clean$end_lat, 3), nsmall = 3)
ride_data_clean$end_lng<- format(round(ride_data_clean$end_lng, 3), nsmall = 3)
##Export to csv to switch to tableau
fwrite(ride_data_clean,
"C:\\Users\\craze\\OneDrive\\Documents\\Cyclistic\\ride_data.csv",
col.names = TRUE,row.names = FALSE)