Two datasets were used to build the application. Both were obtained from the Chicago Data Portal.
The 2019 Taxi Trips dataset contains 23 columns, of which only 6 are needed for this visualization. Trip Start Timestamp is a string giving the time the trip started, rounded to the nearest 15 minutes. Trip Seconds is an integer giving the duration of the trip in seconds. Trip Miles is a float giving the distance of the trip in miles. Pickup Community Area is an integer identifying the Community Area where the trip began; it is blank for locations outside Chicago. Dropoff Community Area is an integer identifying the Community Area where the trip ended; it is likewise blank for locations outside Chicago. Company is a string naming the taxi company that operated the trip.
The Boundaries - Community Areas dataset contains the boundary of each community area in Chicago. It can be downloaded in GeoJSON format, which can then be used to draw each community area as a colored region on a map in RStudio.
The project is published on the EVL shiny-server at UIC. To keep the application responsive and its start-up time as short as possible, we split the data into subfolders of CSV files. Two datasets need to be downloaded; links to both are provided at the bottom of the page. The Python script used for splitting the taxi trip data is provided below:
import os
import numpy as np
import pandas as pd
import csv
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
def mkdir(dir):
    """Create the directory `dir` if it does not already exist.

    Missing parent directories are created as well, and calling this on a
    directory that already exists is a no-op.

    Args:
        dir: path of the directory to create (a trailing slash is allowed).
            The parameter name shadows the builtin but is kept so existing
            callers keep working.

    Raises:
        FileExistsError: if `dir` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + os.mkdir.
    os.makedirs(dir, exist_ok=True)
def preprocess(df, dictCompany):
    """Clean the raw taxi-trip table and reduce it to the columns the app uses.

    Steps, in order:
      * keep only the six columns of interest;
      * keep trips of 0.5-100 miles and 60-18000 seconds (both inclusive);
      * add `tripKM` (miles * 1.609, rounded to 2 decimals);
      * recode any pickup/dropoff community area outside 1-77 (including
        blanks) to 78, then drop trips where BOTH ends are 78;
      * replace company names by the codes in `dictCompany` (names missing
        from the mapping are left unchanged);
      * drop rows that still contain missing values;
      * truncate the start timestamp to the hour;
      * cast the area and duration columns to int and rename the columns.

    Args:
        df: raw trips DataFrame containing at least 'Trip Seconds',
            'Trip Miles', 'Pickup Community Area', 'Dropoff Community Area',
            'Company' and 'Trip Start Timestamp'. It is not modified.
        dictCompany: mapping from company name to integer company code.

    Returns:
        A new DataFrame with columns tripSeconds, tripMiles, pickupArea,
        dropArea, company, tripStartTime and tripKM.
    """
    wanted = ['Trip Seconds', 'Trip Miles', 'Pickup Community Area',
              'Dropoff Community Area', 'Company', 'Trip Start Timestamp']
    # Explicit copies: the column assignments below must act on an independent
    # frame, not on a view of the caller's data (chained assignment).
    df = df[wanted].copy()

    df = df[df['Trip Miles'].between(0.5, 100)].copy()
    df['tripKM'] = df['Trip Miles'].apply(lambda x: round(x * 1.609, 2))
    df = df[df['Trip Seconds'].between(60, 18000)].copy()

    # Anything that is not a community area 1-77 (NaN included, since the
    # comparison is False for NaN) is recoded to 78.
    for column in ('Pickup Community Area', 'Dropoff Community Area'):
        in_city = df[column].between(1, 77)
        df[column] = df[column].where(in_city, 78)

    both_outside = (df['Pickup Community Area'] == 78) & (df['Dropoff Community Area'] == 78)
    df = df[~both_outside]

    df = df.replace({"Company": dictCompany})
    df = df.dropna()

    # Truncate to the hour. `pd.datetime` no longer exists in pandas >= 2.0,
    # and the vectorised floor avoids a Python-level call per row.
    df['Trip Start Timestamp'] = pd.to_datetime(df['Trip Start Timestamp']).dt.floor('h')

    for column in ('Pickup Community Area', 'Dropoff Community Area', 'Trip Seconds'):
        df[column] = df[column].astype(int)

    return df.rename(columns={
        'Trip Seconds': 'tripSeconds',
        'Trip Miles': 'tripMiles',
        'Pickup Community Area': 'pickupArea',
        'Dropoff Community Area': 'dropArea',
        'Company': 'company',
        'Trip Start Timestamp': 'tripStartTime',
    })
def _count_by(df, key, label):
    """Count the rows of `df` per value of `key`; columns are [label, 'Count']."""
    counts = df.groupby(key).size()
    return counts.rename('Count').rename_axis(label).reset_index()


def _count_binned(values, edges, labels, label):
    """Count `values` per right-closed bin; columns are [label, 'Count'].

    Every bin is reported, including empty ones (Count 0), in bin order.
    """
    binned = pd.cut(values, bins=edges, labels=labels)
    # observed=False keeps empty bins regardless of the pandas default.
    counts = values.groupby(binned, observed=False).size()
    return counts.rename('Count').rename_axis(label).reset_index()


def _area_share(df, column, outChicago):
    """Percentage of trips per community area; columns are [column, 'Percentage'].

    Areas 1-77 (1-78 when `outChicago` is true) that have no trips are added
    with a percentage of 0, and rows are sorted by area.
    """
    counts = df.groupby(column).size()
    share = (100.0 * counts / counts.sum()).round(2)
    last_area = 78 if outChicago else 77
    # Union (not a plain reindex) so that any area already present in the
    # data is kept even if it lies outside the 1..last_area range.
    every_area = share.index.union(pd.Index(range(1, last_area + 1)))
    share = share.reindex(every_area, fill_value=0)
    return share.rename('Percentage').rename_axis(column).reset_index()


def preCalc(df, str, drop=False, pick=False, outChicago=False):
    """Pre-compute the summary tables for one selection of trips and write them as CSV.

    Files written (each path is `str` + the file name):
      date.csv, hour.csv, day.csv, month.csv  - trip counts per start date,
          hour, day of week and month;
      mileage_miles.csv, mileage_km.csv       - trip counts per distance bin;
      time.csv                                - trip counts per duration bin;
      drop.csv (only if `drop`)               - percentage of trips per dropoff area;
      pick.csv (only if `pick`)               - percentage of trips per pickup area.

    Args:
        df: trips with columns tripStartTime (datetime), tripMiles, tripKM,
            tripSeconds, dropArea and pickupArea. It is not modified.
        str: path prefix for the output files, normally a directory path
            ending in '/'. The name shadows the builtin but is kept so
            existing callers keep working.
        drop: also write drop.csv.
        pick: also write pick.csv.
        outChicago: when true, the percentage tables cover areas 1-78
            instead of 1-77. Only used when `drop` or `pick` is set.

    Returns:
        None.
    """
    prefix = str
    start = df['tripStartTime']

    _count_by(df, start.dt.date, 'Date').to_csv(prefix + 'date.csv', index=False)
    _count_by(df, start.dt.hour, 'Hour').to_csv(prefix + 'hour.csv', index=False)
    # Values: 0 to 6. (0: Monday, 6: Sunday)
    _count_by(df, start.dt.dayofweek, 'Day').to_csv(prefix + 'day.csv', index=False)
    _count_by(df, start.dt.month, 'Month').to_csv(prefix + 'month.csv', index=False)

    # The lowest edge of each binning sits just below the smallest value of
    # interest (0.5 mi / 0.8 km / 60 s) because pd.cut bins exclude their
    # left edge.
    miles = _count_binned(
        df['tripMiles'],
        [0.49, 1, 3, 5, 10, 15, 20, 100],
        ['0.5 - 1', '1 - 3', '3 - 5', '5 - 10', '10 - 15', '15 - 20', '20 - 100'],
        'Mileage_miles',
    )
    miles.to_csv(prefix + 'mileage_miles.csv', index=False)

    km = _count_binned(
        df['tripKM'],
        [0.79, 2, 5, 10, 15, 25, 35, 160.0],
        ['0.8 - 2', '2 - 5', '5 - 10', '10 - 15', '15 - 25', '25 - 35', '35 - 160'],
        'Mileage_km',
    )
    km.to_csv(prefix + 'mileage_km.csv', index=False)

    duration = _count_binned(
        df['tripSeconds'],
        [59.99, 300, 600, 900, 1200, 1800, 3600, np.inf],
        ['1 - 5 min', '5 - 10 min', '10 - 15 min', '15 - 20 min', '20 - 30 min', '1/2 hr - 1 hr', '> 1 hr'],
        'timeTaken',
    )
    duration.to_csv(prefix + 'time.csv', index=False)

    if drop:
        _area_share(df, 'dropArea', outChicago).to_csv(prefix + 'drop.csv', index=False)
    if pick:
        _area_share(df, 'pickupArea', outChicago).to_csv(prefix + 'pick.csv', index=False)
# ---------------------------------------------------------------------------
# Driver: load the raw trips, clean them, and write one folder of summary CSVs
# for every selection the application can show. Each selection is exported
# twice: "outsideCity" (trips with one end outside Chicago included) and
# "onlyCity" (both ends inside Chicago).
# ---------------------------------------------------------------------------
filename = "Taxi_Trips_-_2019.csv"
# Load only the columns preprocess() keeps; the remaining columns of the raw
# file are never used and only cost memory.
df = pd.read_csv(filename, usecols=[
    'Trip Seconds', 'Trip Miles', 'Pickup Community Area',
    'Dropoff Community Area', 'Company', 'Trip Start Timestamp',
])

# taxiDict.csv is read as (code, company name) pairs and inverted into
# company name -> integer code.
with open('taxiDict.csv', newline='') as csv_file:
    mydict = dict(csv.reader(csv_file))
dictCompany = {name: int(code) for code, name in mydict.items()}

dfTaxi = preprocess(df, dictCompany)
# Area 78 is the code preprocess() assigns to locations outside Chicago.
dfTaxiSub = dfTaxi[~((dfTaxi['pickupArea'] == 78) | (dfTaxi['dropArea'] == 78))]

AREA_IDS = range(1, 78)   # community areas 1-77
# Company codes 0-54 - presumably the codes assigned in taxiDict.csv;
# TODO confirm against that file.
NUM_COMPANIES = 55


def _export_scopes(base, all_trips, city_trips, drop=False, pick=False):
    """Write the summary CSVs for one selection under base/outsideCity/ and base/onlyCity/.

    `base` must end with '/' and its parent directory must already exist.
    """
    mkdir(base)
    scopes = (
        ("outsideCity/", all_trips, True),
        ("onlyCity/", city_trips, False),
    )
    for folder, trips, includes_outside in scopes:
        out_dir = base + folder
        mkdir(out_dir)
        preCalc(trips, out_dir, drop, pick, includes_outside)


# All Areas All Taxi
_export_scopes("allAreaAllTaxi/", dfTaxi, dfTaxiSub)

# All Taxi, per pickup area ("From") and per dropoff area ("To").
# "From" folders get the percentage of trips per dropoff area (drop.csv);
# "To" folders get the percentage of trips per pickup area (pick.csv).
mkdir("allTaxi")
mkdir("allTaxi/From")
mkdir("allTaxi/To")
for area in AREA_IDS:
    _export_scopes(
        "allTaxi/From/Area-{}/".format(area),
        dfTaxi[dfTaxi['pickupArea'] == area],
        dfTaxiSub[dfTaxiSub['pickupArea'] == area],
        drop=True,
    )
    _export_scopes(
        "allTaxi/To/Area-{}/".format(area),
        dfTaxi[dfTaxi['dropArea'] == area],
        dfTaxiSub[dfTaxiSub['dropArea'] == area],
        pick=True,
    )

# Per company: all areas, then every company x area (To/From) combination.
mkdir("allArea")
mkdir("allCombination")
for company in range(NUM_COMPANIES):
    # Filter by company once; the area loop below only narrows these frames.
    company_all = dfTaxi[dfTaxi['company'] == company]
    company_city = dfTaxiSub[dfTaxiSub['company'] == company]

    # All Areas
    _export_scopes("allArea/Taxi-{}/".format(company), company_all, company_city)

    # Taxi - Area (To/From) All Combinations
    combo = "allCombination/Taxi-{}/".format(company)
    mkdir(combo)
    mkdir(combo + "From/")
    mkdir(combo + "To/")
    for area in AREA_IDS:
        _export_scopes(
            combo + "From/Area-{}/".format(area),
            company_all[company_all['pickupArea'] == area],
            company_city[company_city['pickupArea'] == area],
            drop=True,
        )
        _export_scopes(
            combo + "To/Area-{}/".format(area),
            company_all[company_all['dropArea'] == area],
            company_city[company_city['dropArea'] == area],
            pick=True,
        )