Project 3

About the Data

Data Source:

Two datasets were used to build the application. Both datasets were collected from Chicago Data Portal.

The dataset that contains information about Taxi rides in Chicago for 2019 can be found here. The file size is about 7GB.
The dataset that contains information regarding boundaries for each community area in Chicago can be found here. The file size is about 2MB.

Data Usage:

The 2019 taxi trips dataset contains 23 columns, of which only 6 are used for this visualization. Trip Start Timestamp is a string that describes when the trip started, rounded to the nearest 15 minutes. Trip Seconds is an integer that gives the duration of the trip in seconds. Trip Miles is a float that gives the distance of the trip in miles. Pickup Community Area is an integer that identifies the Community Area where the trip began; this column is blank for locations outside Chicago. Dropoff Community Area is an integer that identifies the Community Area where the trip ended; this column is blank for locations outside Chicago. Company is a string that names the taxi company that operated the trip.

The Boundaries - Community Areas dataset contains the boundaries of each community area in Chicago. This data is downloadable in GeoJSON format, which can then be used to shade each community area with a color on a map in RStudio.

The EVL shiny-server at UIC was used to publish this project. To keep the application responsive and its start-up time as short as possible, we split the data into various subfolders, each containing its own CSV files. Two datasets need to be downloaded, both of which are provided at the bottom of the page. The Python script used for splitting the ridership data is provided below:

import os

import numpy as np

import pandas as pd

import csv

from datetime import datetime

import warnings

warnings.filterwarnings("ignore")

def mkdir(dir):
    """Create the directory ``dir`` unless it already exists.

    Missing parent directories are created as well, and calling this on an
    existing directory is a no-op.  The parameter keeps its original name
    (``dir``) for backward compatibility even though it shadows the builtin.

    Raises:
        FileExistsError: if ``dir`` exists but is not a directory.
    """
    # A single call with exist_ok=True avoids the race between a separate
    # os.path.exists() check and the subsequent os.mkdir().
    os.makedirs(dir, exist_ok=True)

def preprocess(df, dictCompany):
    """Clean the raw taxi-trip table and reduce it to the columns used downstream.

    Steps, in order:
      * keep only the six columns of interest;
      * keep trips of 0.5 to 100 miles lasting 60 to 18000 seconds (inclusive);
      * add ``tripKM`` (miles * 1.609, rounded to 2 decimals);
      * recode any pickup/dropoff community area outside 1..77 (including
        missing values) as 78, then drop trips where both ends are 78;
      * replace company names using ``dictCompany``;
      * drop rows that still contain missing values;
      * truncate the start timestamp to the hour and cast the area and
        duration columns to ``int``;
      * rename the columns to their short names.

    Args:
        df: DataFrame containing at least the columns 'Trip Seconds',
            'Trip Miles', 'Pickup Community Area', 'Dropoff Community Area',
            'Company' and 'Trip Start Timestamp'.  It is not modified.
        dictCompany: mapping from company name to the value that replaces it
            in the 'Company' column.  Names absent from the mapping are left
            unchanged.

    Returns:
        A new DataFrame with the columns tripSeconds, tripMiles, pickupArea,
        dropArea, company, tripStartTime and tripKM.
    """
    pickup = 'Pickup Community Area'
    dropoff = 'Dropoff Community Area'
    wanted = ['Trip Seconds', 'Trip Miles', pickup, dropoff,
              'Company', 'Trip Start Timestamp']
    trips = df[wanted]

    # between() is inclusive on both ends and False for missing values,
    # matching the original ">= low & <= high" filters.
    plausible = (trips['Trip Miles'].between(0.5, 100)
                 & trips['Trip Seconds'].between(60, 18000))
    # Work on an explicit copy so the column assignments below never write
    # into a view of the caller's frame.
    trips = trips[plausible].copy()

    trips['tripKM'] = trips['Trip Miles'].apply(
        lambda miles: round(miles * 1.609, 2))

    # Anything that is not a valid Chicago community area (1..77), including
    # blanks, is folded into the pseudo-area 78.
    for column in (pickup, dropoff):
        trips[column] = trips[column].where(trips[column].between(1, 77), 78)

    bothOutside = (trips[pickup] == 78) & (trips[dropoff] == 78)
    trips = trips[~bothOutside]

    trips = trips.replace({"Company": dictCompany})
    trips = trips.dropna()

    # astype() returns a fresh frame, so the timestamp assignment that
    # follows operates on an independent object.
    trips = trips.astype({pickup: int, dropoff: int, 'Trip Seconds': int})

    # pd.datetime was removed from pandas; flooring to the hour gives the
    # same "minutes and seconds set to zero" result, vectorised.
    trips['Trip Start Timestamp'] = pd.to_datetime(
        trips['Trip Start Timestamp']).dt.floor('h')

    return trips.rename(columns={
        'Trip Seconds': 'tripSeconds',
        'Trip Miles': 'tripMiles',
        pickup: 'pickupArea',
        dropoff: 'dropArea',
        'Company': 'company',
        'Trip Start Timestamp': 'tripStartTime',
    })

def _countRows(df, keys, label):
    """Return a (``label``, 'Count') frame with the number of rows of ``df`` per value of ``keys``."""
    # observed=False keeps every category of a binned key, so bins without
    # any trip are still written with a count of 0.
    sizes = df.groupby(keys, observed=False).size()
    return sizes.rename('Count').rename_axis(label).reset_index()


def _binnedCounts(df, column, edges, labels, label):
    """Count the rows of ``df`` per right-closed bin of ``column``, naming the bins with ``labels``."""
    bins = pd.cut(df[column], bins=edges, labels=labels)
    return _countRows(df, bins, label)


def _areaShares(df, column, outChicago):
    """Return the percentage of rows per community area in ``column``, with absent areas filled with 0."""
    lastArea = 78 if outChicago else 77
    counts = df.groupby(column).size()
    shares = (100. * counts / counts.sum()).round(2)
    # The union keeps any area already present in the data and adds every
    # area from 1 to lastArea; it also leaves the index sorted.
    everyArea = shares.index.union(pd.RangeIndex(1, lastArea + 1))
    shares = shares.reindex(everyArea, fill_value=0)
    return shares.rename('Percentage').rename_axis(column).reset_index()


def preCalc(df, str, drop=False, pick=False, outChicago=False):
    """Write the pre-aggregated CSV summaries for one subset of trips.

    The following files are always written, each as a two-column table
    ending in 'Count': date.csv, hour.csv, day.csv (0 = Monday .. 6 = Sunday),
    month.csv, mileage_miles.csv, mileage_km.csv and time.csv.  The last three
    use fixed bins, and every bin is written even when it holds no trips.

    Args:
        df: trips with the columns tripStartTime (datetime), tripMiles,
            tripKM, tripSeconds and, when requested, dropArea / pickupArea.
            The frame is not modified.
        str: prefix prepended verbatim to every file name (callers pass a
            directory path ending in '/').  The name shadows the builtin but
            is kept for backward compatibility.
        drop: also write drop.csv, the percentage of trips per drop-off area.
        pick: also write pick.csv, the percentage of trips per pickup area.
        outChicago: when True the area tables cover areas 1..78, otherwise
            1..77.  Areas without trips get a percentage of 0.
    """
    prefix = str
    started = df['tripStartTime'].dt

    _countRows(df, started.date, 'Date').to_csv(prefix + 'date.csv', index=False)
    _countRows(df, started.hour, 'Hour').to_csv(prefix + 'hour.csv', index=False)
    # Values: 0 to 6. (0: Monday, 6: Sunday)
    _countRows(df, started.dayofweek, 'Day').to_csv(prefix + 'day.csv', index=False)
    _countRows(df, started.month, 'Month').to_csv(prefix + 'month.csv', index=False)

    # The lowest edge sits just below the smallest value kept by the filters
    # so that the first (right-closed) bin includes it.
    _binnedCounts(
        df, 'tripMiles',
        [0.49, 1, 3, 5, 10, 15, 20, 100],
        ['0.5 - 1', '1 - 3', '3 - 5', '5 - 10', '10 - 15', '15 - 20', '20 - 100'],
        'Mileage_miles',
    ).to_csv(prefix + 'mileage_miles.csv', index=False)

    _binnedCounts(
        df, 'tripKM',
        [0.79, 2, 5, 10, 15, 25, 35, 160.0],
        ['0.8 - 2', '2 - 5', '5 - 10', '10 - 15', '15 - 25', '25 - 35', '35 - 160'],
        'Mileage_km',
    ).to_csv(prefix + 'mileage_km.csv', index=False)

    _binnedCounts(
        df, 'tripSeconds',
        [59.99, 300, 600, 900, 1200, 1800, 3600, np.inf],
        ['1 - 5 min', '5 - 10 min', '10 - 15 min', '15 - 20 min', '20 - 30 min', '1/2 hr - 1 hr', '> 1 hr'],
        'timeTaken',
    ).to_csv(prefix + 'time.csv', index=False)

    if drop:
        _areaShares(df, 'dropArea', outChicago).to_csv(prefix + 'drop.csv', index=False)

    if pick:
        _areaShares(df, 'pickupArea', outChicago).to_csv(prefix + 'pick.csv', index=False)

# Driver script: load the raw trips, clean them, then write one folder of
# pre-aggregated CSV files for every view.  Each view is written twice:
#   outsideCity/ - trips with at most one end outside Chicago (area 78)
#   onlyCity/    - trips with both ends inside Chicago

filename = "Taxi_Trips_-_2019.csv"

# preprocess() only uses these six columns, so the other columns of the
# multi-gigabyte export are never loaded.
df = pd.read_csv(filename, usecols=[
    'Trip Seconds', 'Trip Miles', 'Pickup Community Area',
    'Dropoff Community Area', 'Company', 'Trip Start Timestamp',
])

# taxiDict.csv holds two-column rows (numeric id, company name); invert them
# into a name -> id mapping.  newline='' is what the csv module expects.
with open('taxiDict.csv', newline='') as csv_file:
    reader = csv.reader(csv_file)
    mydict = dict(reader)

dictCompany = {name: int(code) for code, name in mydict.items()}

dfTaxi = preprocess(df, dictCompany)

# All Areas All Taxi
mkdir("allAreaAllTaxi")
mkdir("allAreaAllTaxi/outsideCity/")
mkdir("allAreaAllTaxi/onlyCity/")

preCalc(dfTaxi, "allAreaAllTaxi/outsideCity/")

# Trips that start and end inside Chicago.
dfTaxiSub = dfTaxi[~((dfTaxi['pickupArea'] == 78) | (dfTaxi['dropArea'] == 78))]
preCalc(dfTaxiSub, "allAreaAllTaxi/onlyCity/")

# All Taxi Pickup Areas
# (for each pickup area: percentage of trips going to every drop-off area)
mkdir("allTaxi")
mkdir("allTaxi/From")

for i in range(1, 78):
    out_dir = "allTaxi/From/Area-{}/".format(i)
    mkdir(out_dir)

    mkdir(out_dir + "outsideCity/")
    preCalc(dfTaxi[dfTaxi['pickupArea'] == i], out_dir + "outsideCity/",
            drop=True, pick=False, outChicago=True)

    mkdir(out_dir + "onlyCity/")
    preCalc(dfTaxiSub[dfTaxiSub['pickupArea'] == i], out_dir + "onlyCity/",
            drop=True, pick=False, outChicago=False)

# All Taxi Dropoff Areas
# Percentage Coming From
mkdir("allTaxi/To")

for i in range(1, 78):
    out_dir = "allTaxi/To/Area-{}/".format(i)
    mkdir(out_dir)

    mkdir(out_dir + "outsideCity/")
    preCalc(dfTaxi[dfTaxi['dropArea'] == i], out_dir + "outsideCity/",
            drop=False, pick=True, outChicago=True)

    mkdir(out_dir + "onlyCity/")
    preCalc(dfTaxiSub[dfTaxiSub['dropArea'] == i], out_dir + "onlyCity/",
            drop=False, pick=True, outChicago=False)

# All Areas
# (one folder per company id 0..54)
mkdir("allArea")

for i in range(55):
    out_dir = "allArea/Taxi-{}/".format(i)
    mkdir(out_dir)

    mkdir(out_dir + "outsideCity/")
    preCalc(dfTaxi[dfTaxi['company'] == i], out_dir + "outsideCity/")

    mkdir(out_dir + "onlyCity/")
    preCalc(dfTaxiSub[dfTaxiSub['company'] == i], out_dir + "onlyCity/")

# Taxi - Area (To/From) All Combinations
mkdir("allCombination")

for i in range(55):
    out_dir = "allCombination/Taxi-{}/".format(i)
    mkdir(out_dir)
    mkdir(out_dir + "From/")
    mkdir(out_dir + "To/")

    # Filter by company once per company instead of once per area.
    companyAll = dfTaxi[dfTaxi['company'] == i]
    companyCity = dfTaxiSub[dfTaxiSub['company'] == i]

    for j in range(1, 78):
        from_dir = "allCombination/Taxi-{}/From/Area-{}/".format(i, j)
        mkdir(from_dir)

        mkdir(from_dir + "onlyCity/")
        preCalc(companyCity[companyCity['pickupArea'] == j], from_dir + "onlyCity/",
                drop=True, pick=False, outChicago=False)

        mkdir(from_dir + "outsideCity/")
        preCalc(companyAll[companyAll['pickupArea'] == j], from_dir + "outsideCity/",
                drop=True, pick=False, outChicago=True)

        to_dir = "allCombination/Taxi-{}/To/Area-{}/".format(i, j)
        mkdir(to_dir)

        mkdir(to_dir + "onlyCity/")
        preCalc(companyCity[companyCity['dropArea'] == j], to_dir + "onlyCity/",
                drop=False, pick=True, outChicago=False)

        mkdir(to_dir + "outsideCity/")
        preCalc(companyAll[companyAll['dropArea'] == j], to_dir + "outsideCity/",
                drop=False, pick=True, outChicago=True)

Page updated

Google Sites

Report abuse