Deliverable for Process Phase
Guiding questions
What tools are you choosing and why?
Trip data is available from 2013 through March 2022, but this analysis uses only the most recent year of data, from April 2021 to March 2022. The R language is used to perform the analysis for this case study.
Have you ensured your data’s integrity?
Accuracy
Completeness
Consistency
Trustworthiness
These are the qualities that ensure the data's integrity. The data is complete: it contains all the components required to measure the entity. It is also consistent across the period analyzed, with each month having its own CSV file organized with the same number of columns and the same data types. Since its credibility was established earlier, the data is trustworthy.
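As a quick illustration, a check along the following lines (a minimal sketch, assuming the twelve monthly data frames have already been read in as shown in the documentation further below; the helper list monthly_trips exists only for this check) confirms that every month's file carries the same columns with the same data types:
monthly_trips <- list(apr_2021, may_2021, jun_2021, jul_2021, aug_2021, sep_2021,
                      oct_2021, nov_2021, dec_2021, jan_2022, feb_2022, mar_2022)
# TRUE if every month has exactly the same column names, in the same order, as April 2021
all(sapply(monthly_trips, function(df) identical(colnames(df), colnames(apr_2021))))
# TRUE if every month's columns also have the same data types as April 2021
all(sapply(monthly_trips, function(df) identical(sapply(df, class), sapply(apr_2021, class))))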
What steps have you taken to ensure that your data is clean?
The monthly CSV files are concatenated into a single data frame.
All empty rows and columns are removed from the concatenated data frame.
count() is used to check the unique values of each variable, so that misspellings can be spotted.
All NA values are removed from the data frame.
All duplicate rides are removed; a condensed sketch of these steps is shown below.
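A condensed sketch of these cleaning steps, equivalent to the step-by-step code documented further below (it assumes the combined data frame all_trips created there, and uses dplyr's distinct() in place of the duplicated() call):
new_all_trips <- all_trips %>%
  remove_empty(which = c("rows", "cols")) %>%  # drop empty rows and columns
  na.omit() %>%                                # drop rows containing NA values
  distinct(ride_id, .keep_all = TRUE)          # drop duplicate rides by ride_id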
How can you verify that your data is clean and ready to analyze?
With all the cleaning tasks mentioned above completed, the following functions are run to verify:
a) filter() to check whether any missing values remain
b) count() to check the unique values of each variable
c) duplicated() to check for any duplicates still present (see the verification sketch below)
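A minimal verification sketch using these functions, run on the cleaned data frame new_all_trips produced in the documentation below:
# Any rows with a blank or missing start station left?
count(filter(new_all_trips, start_station_name == '' | is.na(start_station_name)), start_station_name)
# Unique values of the categorical variables
count(new_all_trips, rideable_type)
count(new_all_trips, member_casual)
# Any duplicate ride ids remaining?
sum(duplicated(new_all_trips$ride_id))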
Have you documented your cleaning process so you can review and share those results?
The code and snippets used for cleaning and manipulating the data are documented below.
Documentation of cleaning and manipulation of data in R
# Install required packages
# tidyverse for data import and wrangling
# lubridate for date functions
# ggplot2 for visualization
# dplyr for data manipulation
# janitor for removing empty rows and columns
install.packages("tidyverse")
install.packages("lubridate")
install.packages("ggplot2")
install.packages("dplyr")
install.packages("janitor")
library(tidyverse)
library(lubridate)
library(ggplot2)
library(dplyr)
library(janitor)
# STEP 1: COLLECT DATA
# Upload Divvy datasets (csv files) here
apr_2021<-read.csv("202104-divvy-tripdata_rc.csv")
may_2021<-read.csv("202105-divvy-tripdata_rc.csv")
jun_2021<-read.csv("202106-divvy-tripdata_rc.csv")
jul_2021<-read.csv("202107-divvy-tripdata_rc.csv")
aug_2021<-read.csv("202108-divvy-tripdata_rc.csv")
sep_2021<-read.csv("202109-divvy-tripdata_rc.csv")
oct_2021<-read.csv("202110-divvy-tripdata_rc.csv")
nov_2021<-read.csv("202111-divvy-tripdata_rc.csv")
dec_2021<-read.csv("202112-divvy-tripdata_rc.csv")
jan_2022<-read.csv("202201-divvy-tripdata_rc.csv")
feb_2022<-read.csv("202202-divvy-tripdata_rc.csv")
mar_2022<-read.csv("202203-divvy-tripdata_rc.csv")
# STEP 2: WRANGLE DATA AND COMBINE INTO A SINGLE FILE
# Compare the column names of each of the files
# While the names don't have to be in the same order, they DO need to match perfectly before we can use a command to join them into one file
colnames(apr_2021)
colnames(may_2021)
colnames(jun_2021)
colnames(jul_2021)
colnames(aug_2021)
colnames(sep_2021)
colnames(oct_2021)
colnames(nov_2021)
colnames(dec_2021)
colnames(jan_2022)
colnames(feb_2022)
colnames(mar_2022)
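# Optional sketch: the same comparison in a single call with janitor's compare_df_cols(),
# which lists any columns whose names or types differ across the files
compare_df_cols(apr_2021, may_2021, jun_2021, jul_2021, aug_2021, sep_2021,
                oct_2021, nov_2021, dec_2021, jan_2022, feb_2022, mar_2022,
                return = "mismatch")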
# Convert ride_id and rideable_type to character so that they can stack correctly
apr_2021 <- mutate(apr_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
may_2021 <- mutate(may_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
jun_2021 <- mutate(jun_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
jul_2021 <- mutate(jul_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
aug_2021 <- mutate(aug_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
sep_2021 <- mutate(sep_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
oct_2021 <- mutate(oct_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
nov_2021 <- mutate(nov_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
dec_2021 <- mutate(dec_2021, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
jan_2022 <- mutate(jan_2022, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
feb_2022 <- mutate(feb_2022, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
mar_2022 <- mutate(mar_2022, ride_id = as.character(ride_id)
,rideable_type = as.character(rideable_type))
# Stack the individual monthly data frames into one big data frame
all_trips <- bind_rows(apr_2021,may_2021,jun_2021,jul_2021,aug_2021,sep_2021,oct_2021,nov_2021,dec_2021,jan_2022,feb_2022,mar_2022)
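# Alternative sketch (assumption: all twelve CSVs sit in the working directory and
# follow the "*-divvy-tripdata_rc.csv" naming used above): the files could also be
# read and stacked in a single step, producing an object equivalent to all_trips
trip_files <- list.files(pattern = "-divvy-tripdata_rc\\.csv$")
all_trips_alt <- map_dfr(trip_files, read.csv)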
# Remove any empty rows or columns, then check for missing values in each variable
new_all_trips <- remove_empty(all_trips, which=c("rows", "cols"))
count(filter(all_trips, start_station_name==''),start_station_name, member_casual,sort=TRUE)
# count() returns the unique values of the variable passed, together with their frequencies
new_all_trips %>% count(rideable_type)
new_all_trips %>% count(member_casual)
#omitting NA values in the entire data frame
new_all_trips <- na.omit(new_all_trips)
#Removing duplicates
new_all_trips<- new_all_trips[!duplicated(new_all_trips$ride_id), ]
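# Quick check sketch: the cleaned data frame should now contain no NA values
sum(is.na(new_all_trips))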
# Inspect the new table that has been created
colnames(new_all_trips) #List of column names
nrow(new_all_trips) #How many rows are in data frame?
dim(new_all_trips) #Dimensions of the data frame?
head(new_all_trips) #See the first 6 rows of the data frame. Also tail(new_all_trips)
str(new_all_trips) #See list of columns and data types (numeric, character, etc)
summary(new_all_trips) #Statistical summary of data. Mainly for numerics
# Add columns that list the date, month, day, and year of each ride
# This will allow us to aggregate ride data for each month, day, or year ... before completing these operations we could only aggregate at the ride level
new_all_trips$date <- as.Date(new_all_trips$started_at) #The default format is yyyy-mm-dd
new_all_trips$month <- format(as.Date(new_all_trips$date), "%m")
new_all_trips$day <- format(as.Date(new_all_trips$date), "%d")
new_all_trips$year <- format(as.Date(new_all_trips$date), "%Y")
new_all_trips$day_of_week <- format(as.Date(new_all_trips$date), "%A")
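# Quick illustration of what the new columns enable: ride counts per day of the week
new_all_trips %>% count(day_of_week)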
# Add a "ride_length" calculation to all_trips (in seconds)
#as we need the ride duration ; start date and end data are also assigned as Datetime data type
new_all_trips$started_at<-as_datetime(new_all_trips$started_at)
new_all_trips$ended_at<-as_datetime(new_all_trips$ended_at)
new_all_trips$ride_length <- difftime(new_all_trips$ended_at, new_all_trips$started_at, units = "secs")
str(new_all_trips)
# Convert "ride_length" from Factor to numeric so we can run calculations on the data
is.factor(new_all_trips$ride_length)
new_all_trips$ride_length <- as.numeric(as.character(new_all_trips$ride_length))
is.numeric(new_all_trips$ride_length)
# Remove rides with a length of 0 seconds or less (keep only ride_length > 0)
all_trips_V2 <- filter(new_all_trips, ride_length > 0)
# Minimum and maximum length of ride
min(all_trips_V2$ride_length)
max(all_trips_V2$ride_length)
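# Optional sanity check: full distribution of ride_length
summary(all_trips_V2$ride_length)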