Taxi Data
- Download the 2019 Chicago taxi trips dataset.
import pandas as pd
# Load the raw 2019 taxi trips export and preview the first rows.
taxi_csv = 'Taxi_Trips_-_2019.csv'
data = pd.read_csv(taxi_csv)
data.head()
- Process the raw data.
# Keep only the trip attributes used in the rest of the analysis.
trip_columns = [
    'Trip Start Timestamp',
    'Trip Seconds',
    'Trip Miles',
    'Pickup Community Area',
    'Dropoff Community Area',
]
df = data[trip_columns]
del data  # the full raw table is no longer needed

# Remove rows with NaN, then trips whose duration or distance is zero.
# A boolean mask does the same job as dropping by index label without
# materialising the intermediate index objects.
df = df.dropna()
df = df[(df['Trip Seconds'] != 0) & (df['Trip Miles'] != 0)]
df = df.reset_index(drop=True)
df.to_csv('taxi_trip_2019.csv', index=False)

import numpy as np

# Mean of the 'Trip Seconds' and 'Trip Miles' columns over the cleaned trips.
print(np.mean(df['Trip Seconds'].values))
print(np.mean(df['Trip Miles'].values))
- Visualize pickup and dropoff trips.
import geopandas as gpd
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14

# Community-area polygons, keyed by the shapefile's `area_numbe` attribute.
chicago = gpd.read_file("Boundaries_Community_Areas/areas.shp")
# Cast the key to float so it can be matched against the community-area
# values in `df`.
# NOTE(review): assumes those columns were parsed as float -- confirm.
chicago['area_numbe'] = chicago.area_numbe.astype(float)

# Number of trips starting / ending in each community area.
pickup_df = df.groupby(['Pickup Community Area']).size().reset_index(name='pickup_counts')
dropoff_df = df.groupby(['Dropoff Community Area']).size().reset_index(name='dropoff_counts')
pickup_df['area_numbe'] = pickup_df['Pickup Community Area']
dropoff_df['area_numbe'] = dropoff_df['Dropoff Community Area']

# Attach the counts to the polygons (join on the shared `area_numbe` index;
# polygons without a matching count row get NaN).
areas = chicago.set_index('area_numbe')
pickup = areas.join(pickup_df.set_index('area_numbe')).reset_index()
dropoff = areas.join(dropoff_df.set_index('area_numbe')).reset_index()

# One choropleth per panel: (frame, count column, colorbar label).
panels = [
    (pickup, 'pickup_counts', 'Pickup trip count'),
    (dropoff, 'dropoff_counts', 'Dropoff trip count'),
]
fig = plt.figure(figsize=(14, 8))
for i, (frame, column, label) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, i)
    # Both panels share the same colour scale (vmin/vmax) so they are
    # directly comparable.
    frame.plot(column, cmap='YlOrRd', legend=True,
               legend_kwds={'shrink': 0.618, 'label': label},
               vmin=0, vmax=4e+6, ax=ax)
    # Hide coordinate ticks on the map axes explicitly instead of relying on
    # pyplot's notion of the "current" axes.
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()
fig.savefig("pickup_dropoff_trips_chicago_2019.png", bbox_inches="tight")
Ridehailing Data
- Download the 2022 Chicago rideshare trip data; see the download page.
Since the original dataset is 18.07 GB with 69,109,780 rows, we preprocess it into a smaller file that keeps only the columns used below (i.e., rideshare_trip_2022.csv,
2.93 GB) for future use.
import pandas as pd
# Columns kept in the reduced file, in the order they are written out.
trip_columns = [
    'Trip Start Timestamp',
    'Trip Seconds',
    'Trip Miles',
    'Pickup Community Area',
    'Dropoff Community Area',
]
chunksize = 10 ** 7

# Read the raw export in chunks, parsing only the needed columns.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call, so the chunks are collected and concatenated once.
# `chunk[trip_columns]` fixes the column order, because `usecols` returns
# columns in file order.
chunks = [
    chunk[trip_columns]
    for chunk in pd.read_csv('Transportation_Network_Providers_-_Trips_-_2022.csv',
                             usecols=trip_columns, chunksize=chunksize)
]
# pd.concat raises on an empty list; fall back to an empty frame instead.
data = pd.concat(chunks) if chunks else pd.DataFrame(columns=trip_columns)
del chunks
data.to_csv('rideshare_trip_2022.csv', index=False)
- Process the raw data.
import pandas as pd
df = pd.read_csv('rideshare_trip_2022.csv')

# Remove rows with NaN, then trips whose duration or distance is zero.
# A boolean mask does the same job as dropping by index label without
# materialising the intermediate index objects.
df = df.dropna()
df = df[(df['Trip Seconds'] != 0) & (df['Trip Miles'] != 0)]
df = df.reset_index(drop=True)

import numpy as np

# Mean of the 'Trip Seconds' and 'Trip Miles' columns over the cleaned trips.
print(np.mean(df['Trip Seconds'].values))
print(np.mean(df['Trip Miles'].values))
# Per-area pickup/dropoff counts are not computed here: the visualization
# step recomputes them from `df` before they are first used.
- Visualize pickup and dropoff trips.
import geopandas as gpd
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14

# Community-area polygons, keyed by the shapefile's `area_numbe` attribute.
chicago = gpd.read_file("Boundaries_Community_Areas/areas.shp")
# Cast the key to float so it can be matched against the community-area
# values in `df`.
# NOTE(review): assumes those columns were parsed as float -- confirm.
chicago['area_numbe'] = chicago.area_numbe.astype(float)

# Number of trips starting / ending in each community area.
pickup_df = df.groupby(['Pickup Community Area']).size().reset_index(name='pickup_counts')
dropoff_df = df.groupby(['Dropoff Community Area']).size().reset_index(name='dropoff_counts')
pickup_df['area_numbe'] = pickup_df['Pickup Community Area']
dropoff_df['area_numbe'] = dropoff_df['Dropoff Community Area']

# Attach the counts to the polygons (join on the shared `area_numbe` index;
# polygons without a matching count row get NaN).
areas = chicago.set_index('area_numbe')
pickup = areas.join(pickup_df.set_index('area_numbe')).reset_index()
dropoff = areas.join(dropoff_df.set_index('area_numbe')).reset_index()

# One choropleth per panel: (frame, count column, colorbar label).
panels = [
    (pickup, 'pickup_counts', 'Pickup trip count'),
    (dropoff, 'dropoff_counts', 'Dropoff trip count'),
]
fig = plt.figure(figsize=(14, 8))
for i, (frame, column, label) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, i)
    # Both panels share the same colour scale (vmin/vmax) so they are
    # directly comparable.
    frame.plot(column, cmap='YlOrRd', legend=True,
               legend_kwds={'shrink': 0.618, 'label': label},
               vmin=0, vmax=1e+7, ax=ax)
    # Hide coordinate ticks on the map axes explicitly instead of relying on
    # pyplot's notion of the "current" axes.
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()
fig.savefig("tnp_pickup_dropoff_trips_chicago_2022.png", bbox_inches="tight")
- Download the 2019 Chicago rideshare trip data; see the download page.
Since the original dataset is 29.61 GB with 111,850,744 rows, we preprocess it into a smaller file that keeps only the columns used below (i.e., rideshare_trip_2019.csv,
4.74 GB) for future use.
import pandas as pd
# Columns kept in the reduced file, in the order they are written out.
trip_columns = [
    'Trip Start Timestamp',
    'Trip Seconds',
    'Trip Miles',
    'Pickup Community Area',
    'Dropoff Community Area',
]
chunksize = 10 ** 7

# Read the raw export in chunks, parsing only the needed columns.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated
# frame on every call, so the chunks are collected and concatenated once.
# `chunk[trip_columns]` fixes the column order, because `usecols` returns
# columns in file order.
chunks = [
    chunk[trip_columns]
    for chunk in pd.read_csv('Transportation_Network_Providers_-_Trips_-_2019.csv',
                             usecols=trip_columns, chunksize=chunksize)
]
# pd.concat raises on an empty list; fall back to an empty frame instead.
data = pd.concat(chunks) if chunks else pd.DataFrame(columns=trip_columns)
del chunks
data.to_csv('rideshare_trip_2019.csv', index=False)
- Process the raw data.
import pandas as pd
df = pd.read_csv('rideshare_trip_2019.csv')

# Remove rows with NaN, then trips whose duration or distance is zero.
# A boolean mask does the same job as dropping by index label without
# materialising the intermediate index objects.
df = df.dropna()
df = df[(df['Trip Seconds'] != 0) & (df['Trip Miles'] != 0)]
df = df.reset_index(drop=True)

import numpy as np

# Mean of the 'Trip Seconds' and 'Trip Miles' columns over the cleaned trips.
print(np.mean(df['Trip Seconds'].values))
print(np.mean(df['Trip Miles'].values))
# Per-area pickup/dropoff counts are not computed here: the visualization
# step recomputes them from `df` before they are first used.
- Visualize pickup and dropoff trips.
import geopandas as gpd
import matplotlib.pyplot as plt
plt.rcParams['font.size'] = 14

# Community-area polygons, keyed by the shapefile's `area_numbe` attribute.
chicago = gpd.read_file("Boundaries_Community_Areas/areas.shp")
# Cast the key to float so it can be matched against the community-area
# values in `df`.
# NOTE(review): assumes those columns were parsed as float -- confirm.
chicago['area_numbe'] = chicago.area_numbe.astype(float)

# Number of trips starting / ending in each community area.
pickup_df = df.groupby(['Pickup Community Area']).size().reset_index(name='pickup_counts')
dropoff_df = df.groupby(['Dropoff Community Area']).size().reset_index(name='dropoff_counts')
pickup_df['area_numbe'] = pickup_df['Pickup Community Area']
dropoff_df['area_numbe'] = dropoff_df['Dropoff Community Area']

# Attach the counts to the polygons (join on the shared `area_numbe` index;
# polygons without a matching count row get NaN).
areas = chicago.set_index('area_numbe')
pickup = areas.join(pickup_df.set_index('area_numbe')).reset_index()
dropoff = areas.join(dropoff_df.set_index('area_numbe')).reset_index()

# One choropleth per panel: (frame, count column, colorbar label).
panels = [
    (pickup, 'pickup_counts', 'Pickup trip count'),
    (dropoff, 'dropoff_counts', 'Dropoff trip count'),
]
fig = plt.figure(figsize=(14, 8))
for i, (frame, column, label) in enumerate(panels, start=1):
    ax = fig.add_subplot(1, 2, i)
    # Both panels share the same colour scale (vmin/vmax) so they are
    # directly comparable.
    frame.plot(column, cmap='YlOrRd', legend=True,
               legend_kwds={'shrink': 0.618, 'label': label},
               vmin=0, vmax=1.8e+7, ax=ax)
    # Hide coordinate ticks on the map axes explicitly instead of relying on
    # pyplot's notion of the "current" axes.
    ax.set_xticks([])
    ax.set_yticks([])
plt.show()
fig.savefig("tnp_pickup_dropoff_trips_chicago_2019.png", bbox_inches="tight")