In [1]:

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20200107224120-0000
KERNEL_ID = 8769c21d-8f83-43fb-b378-61ecbf0924a2
Out[1]:
[Row(CRASH DATE='07/16/2019', CRASH TIME='13:10', BOROUGH=None, ZIP CODE=None, LATITUDE='40.710876', LONGITUDE='-74.01439', LOCATION='POINT (-74.01439 40.710876)', ON STREET NAME='WEST STREET                     ', CROSS STREET NAME=None, OFF STREET NAME=None, NUMBER OF PERSONS INJURED='0', NUMBER OF PERSONS KILLED='0', NUMBER OF PEDESTRIANS INJURED='0', NUMBER OF PEDESTRIANS KILLED='0', NUMBER OF CYCLIST INJURED='0', NUMBER OF CYCLIST KILLED='0', NUMBER OF MOTORIST INJURED='0', NUMBER OF MOTORIST KILLED='0', CONTRIBUTING FACTOR VEHICLE 1='Unspecified', CONTRIBUTING FACTOR VEHICLE 2='Unspecified', CONTRIBUTING FACTOR VEHICLE 3=None, CONTRIBUTING FACTOR VEHICLE 4=None, CONTRIBUTING FACTOR VEHICLE 5=None, COLLISION_ID='4171729', VEHICLE TYPE CODE 1='Taxi', VEHICLE TYPE CODE 2='Sedan', VEHICLE TYPE CODE 3=None, VEHICLE TYPE CODE 4=None, VEHICLE TYPE CODE 5=None),
 Row(CRASH DATE='08/05/2019', CRASH TIME='12:07', BOROUGH='BROOKLYN', ZIP CODE='11207', LATITUDE='40.685795', LONGITUDE='-73.911644', LOCATION='POINT (-73.911644 40.685795)', ON STREET NAME='BUSHWICK AVENUE                 ', CROSS STREET NAME='SCHAEFER STREET', OFF STREET NAME=None, NUMBER OF PERSONS INJURED='1', NUMBER OF PERSONS KILLED='0', NUMBER OF PEDESTRIANS INJURED='0', NUMBER OF PEDESTRIANS KILLED='0', NUMBER OF CYCLIST INJURED='0', NUMBER OF CYCLIST KILLED='0', NUMBER OF MOTORIST INJURED='1', NUMBER OF MOTORIST KILLED='0', CONTRIBUTING FACTOR VEHICLE 1='Following Too Closely', CONTRIBUTING FACTOR VEHICLE 2='Unspecified', CONTRIBUTING FACTOR VEHICLE 3=None, CONTRIBUTING FACTOR VEHICLE 4=None, CONTRIBUTING FACTOR VEHICLE 5=None, COLLISION_ID='4183578', VEHICLE TYPE CODE 1='Taxi', VEHICLE TYPE CODE 2='Station Wagon/Sport Utility Vehicle', VEHICLE TYPE CODE 3=None, VEHICLE TYPE CODE 4=None, VEHICLE TYPE CODE 5=None),
 Row(CRASH DATE='07/26/2019', CRASH TIME='8:45', BOROUGH=None, ZIP CODE=None, LATITUDE='40.715633', LONGITUDE='-73.9985', LOCATION='POINT (-73.9985 40.715633)', ON STREET NAME='BAYARD STREET                   ', CROSS STREET NAME=None, OFF STREET NAME=None, NUMBER OF PERSONS INJURED='0', NUMBER OF PERSONS KILLED='0', NUMBER OF PEDESTRIANS INJURED='0', NUMBER OF PEDESTRIANS KILLED='0', NUMBER OF CYCLIST INJURED='0', NUMBER OF CYCLIST KILLED='0', NUMBER OF MOTORIST INJURED='0', NUMBER OF MOTORIST KILLED='0', CONTRIBUTING FACTOR VEHICLE 1='Other Vehicular', CONTRIBUTING FACTOR VEHICLE 2=None, CONTRIBUTING FACTOR VEHICLE 3=None, CONTRIBUTING FACTOR VEHICLE 4=None, CONTRIBUTING FACTOR VEHICLE 5=None, COLLISION_ID='4177333', VEHICLE TYPE CODE 1='Sedan', VEHICLE TYPE CODE 2='Box Truck', VEHICLE TYPE CODE 3=None, VEHICLE TYPE CODE 4=None, VEHICLE TYPE CODE 5=None),
 Row(CRASH DATE='08/08/2019', CRASH TIME='18:00', BOROUGH=None, ZIP CODE=None, LATITUDE='40.79536', LONGITUDE='-73.96942', LOCATION='POINT (-73.96942 40.79536)', ON STREET NAME='AMSTERDAM AVENUE                ', CROSS STREET NAME=None, OFF STREET NAME=None, NUMBER OF PERSONS INJURED='0', NUMBER OF PERSONS KILLED='0', NUMBER OF PEDESTRIANS INJURED='0', NUMBER OF PEDESTRIANS KILLED='0', NUMBER OF CYCLIST INJURED='0', NUMBER OF CYCLIST KILLED='0', NUMBER OF MOTORIST INJURED='0', NUMBER OF MOTORIST KILLED='0', CONTRIBUTING FACTOR VEHICLE 1='Unspecified', CONTRIBUTING FACTOR VEHICLE 2='Unspecified', CONTRIBUTING FACTOR VEHICLE 3=None, CONTRIBUTING FACTOR VEHICLE 4=None, CONTRIBUTING FACTOR VEHICLE 5=None, COLLISION_ID='4185499', VEHICLE TYPE CODE 1='Sedan', VEHICLE TYPE CODE 2='Sedan', VEHICLE TYPE CODE 3=None, VEHICLE TYPE CODE 4=None, VEHICLE TYPE CODE 5=None),
 Row(CRASH DATE='07/17/2019', CRASH TIME='5:35', BOROUGH='BROOKLYN', ZIP CODE='11236', LATITUDE='40.642605', LONGITUDE='-73.91558', LOCATION='POINT (-73.91558 40.642605)', ON STREET NAME=None, CROSS STREET NAME=None, OFF STREET NAME='8412      FOSTER AVENUE                 ', NUMBER OF PERSONS INJURED='0', NUMBER OF PERSONS KILLED='0', NUMBER OF PEDESTRIANS INJURED='0', NUMBER OF PEDESTRIANS KILLED='0', NUMBER OF CYCLIST INJURED='0', NUMBER OF CYCLIST KILLED='0', NUMBER OF MOTORIST INJURED='0', NUMBER OF MOTORIST KILLED='0', CONTRIBUTING FACTOR VEHICLE 1='Backing Unsafely', CONTRIBUTING FACTOR VEHICLE 2='Unspecified', CONTRIBUTING FACTOR VEHICLE 3=None, CONTRIBUTING FACTOR VEHICLE 4=None, CONTRIBUTING FACTOR VEHICLE 5=None, COLLISION_ID='4172507', VEHICLE TYPE CODE 1='Pick-up Truck', VEHICLE TYPE CODE 2='Sedan', VEHICLE TYPE CODE 3=None, VEHICLE TYPE CODE 4=None, VEHICLE TYPE CODE 5=None)]
In [2]:
collisions.cache()
Out[2]:
DataFrame[CRASH DATE: string, CRASH TIME: string, BOROUGH: string, ZIP CODE: string, LATITUDE: string, LONGITUDE: string, LOCATION: string, ON STREET NAME: string, CROSS STREET NAME: string, OFF STREET NAME: string, NUMBER OF PERSONS INJURED: string, NUMBER OF PERSONS KILLED: string, NUMBER OF PEDESTRIANS INJURED: string, NUMBER OF PEDESTRIANS KILLED: string, NUMBER OF CYCLIST INJURED: string, NUMBER OF CYCLIST KILLED: string, NUMBER OF MOTORIST INJURED: string, NUMBER OF MOTORIST KILLED: string, CONTRIBUTING FACTOR VEHICLE 1: string, CONTRIBUTING FACTOR VEHICLE 2: string, CONTRIBUTING FACTOR VEHICLE 3: string, CONTRIBUTING FACTOR VEHICLE 4: string, CONTRIBUTING FACTOR VEHICLE 5: string, COLLISION_ID: string, VEHICLE TYPE CODE 1: string, VEHICLE TYPE CODE 2: string, VEHICLE TYPE CODE 3: string, VEHICLE TYPE CODE 4: string, VEHICLE TYPE CODE 5: string]
In [3]:
# STEP 3 - Print Schema
print("Records: {}".format(collisions.count()))
collisions.printSchema()
Records: 1632435
root
 |-- CRASH DATE: string (nullable = true)
 |-- CRASH TIME: string (nullable = true)
 |-- BOROUGH: string (nullable = true)
 |-- ZIP CODE: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- LOCATION: string (nullable = true)
 |-- ON STREET NAME: string (nullable = true)
 |-- CROSS STREET NAME: string (nullable = true)
 |-- OFF STREET NAME: string (nullable = true)
 |-- NUMBER OF PERSONS INJURED: string (nullable = true)
 |-- NUMBER OF PERSONS KILLED: string (nullable = true)
 |-- NUMBER OF PEDESTRIANS INJURED: string (nullable = true)
 |-- NUMBER OF PEDESTRIANS KILLED: string (nullable = true)
 |-- NUMBER OF CYCLIST INJURED: string (nullable = true)
 |-- NUMBER OF CYCLIST KILLED: string (nullable = true)
 |-- NUMBER OF MOTORIST INJURED: string (nullable = true)
 |-- NUMBER OF MOTORIST KILLED: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 1: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 2: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 3: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 4: string (nullable = true)
 |-- CONTRIBUTING FACTOR VEHICLE 5: string (nullable = true)
 |-- COLLISION_ID: string (nullable = true)
 |-- VEHICLE TYPE CODE 1: string (nullable = true)
 |-- VEHICLE TYPE CODE 2: string (nullable = true)
 |-- VEHICLE TYPE CODE 3: string (nullable = true)
 |-- VEHICLE TYPE CODE 4: string (nullable = true)
 |-- VEHICLE TYPE CODE 5: string (nullable = true)

In [4]:
# STEP 4 - Load visualization packages
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
sns.set_style("white")
plt.rcParams['figure.figsize'] = (12, 12)
In [5]:
# STEP 5 - SCRUB
collisions_df = collisions

collisions_pd = collisions_df[collisions_df['LATITUDE'] != 0][['LATITUDE', 
                                                               'LONGITUDE', 
                                                               'CRASH DATE', 
                                                               'CRASH TIME', 
                                                               'BOROUGH', 
                                                               'ON STREET NAME', 
                                                               'CROSS STREET NAME',
                                                               'NUMBER OF PERSONS INJURED', 
                                                               'NUMBER OF PERSONS KILLED',
                                                               'CONTRIBUTING FACTOR VEHICLE 1']].toPandas()

collisions_pd.columns = ['Latitude', 'Longitude', 'Date', 'Time', 'Borough', 'On Street',
                         'Cross Street', 'Persons Injured', 'Persons Killed', 'Contributing Factor']

collisions_pd['Latitude'] = collisions_pd['Latitude'].astype(float)
collisions_pd['Longitude'] = collisions_pd['Longitude'].astype(float)
collisions_pd['Persons Killed'] = collisions_pd['Persons Killed'].astype(float)
collisions_pd['Persons Injured'] = collisions_pd['Persons Injured'].astype(float)



#divide dataset in accidents which are: fatal, non-lethal but with person damage, non of the above
killed_pd = collisions_pd[collisions_pd['Persons Killed']!=0]
injured_pd = collisions_pd[np.logical_and(collisions_pd['Persons Injured']!=0, collisions_pd['Persons Killed']==0)]
nothing_pd = collisions_pd[np.logical_and(collisions_pd['Persons Killed']==0, collisions_pd['Persons Injured']==0)]
In [6]:
# STEP 6 - EXPLORE
#create scatterplots
plt.figure(figsize=(20,20))
plt.scatter(collisions_pd.Longitude, collisions_pd.Latitude, alpha=0.05, s=4, color='darkseagreen')

#adjust more settings
plt.title('Motor Vehicle Collisions in New York City', size=25)
plt.xlim((-74.26,-73.7))
plt.ylim((40.5,40.92))
plt.xlabel('Longitude',size=20)
plt.ylabel('Latitude',size=20)

plt.show()
In [43]:
borough = collisions_df.groupby('BOROUGH').count().sort('count').toPandas().iloc[1:,:]
borough['BOROUGH'] = [u'STATEN ISLAND', u'BRONX', u'MANHATTAN', u'QUEENS',u'BROOKLYN',u'None']
colors = ['g','0.75','y','k','b','r']
borough.sort_values(by='count', ascending=True)['count'].plot.barh(color=colors)
plt.xlabel('Collisions')
plt.ylabel('Borough')
plt.title('Total Number of Collisions by Borough', size=15)
plt.yticks(range(1,6), borough['BOROUGH'])
plt.tight_layout()
plt.show()
In [10]:
## Because I love playing around, I wanted to "automate" this instead of typing it out for each Borough
manhattan = collisions_pd[collisions_pd['Borough'] == 'MANHATTAN']
all_boroughs = collisions_pd.Borough.unique()

## FIRST ATTEMPT
## creating a dictionary

# d = {"{}".format(val): collisions_pd[collisions_pd['Borough'] == val] for val in all_boroughs}
# for b in all_boroughs:
#     if b != None:
#         print(d[b].head())

## And iteratively assigning variables... by copying and printing print statments
# for b in all_boroughs:
#     if b != None:
#         subset = "= d['"+b+"']"
#         print(b.lower(), subset)
In [114]:
plt.figure(figsize=(20,20))

plt.scatter(nothing_pd.Longitude, nothing_pd.Latitude, alpha=0.04, s=1, color = 'blue')
plt.scatter(injured_pd.Longitude, injured_pd.Latitude, alpha=0.1, s=1, color = 'yellow')
plt.scatter(killed_pd.Longitude, killed_pd.Latitude,s=5, color = 'red')

blue_patch = mpatches.Patch( label= 'car body damage', alpha = 0.2, color = 'blue')
yellow_patch = mpatches.Patch( label= 'personal injury', alpha = 0.5, color = 'yellow')
red_patch = mpatches.Patch( label= 'lethal accidents', color = 'red')
plt.legend([blue_patch, yellow_patch, red_patch], ('car body damage', 'personal injury', 'fatal accidents'), 
           loc='upper left', prop={'size':20})

plt.title('Severity of Motor Vehicle Collisions in New York City', size=20)
plt.xlim((-74.26,-73.7))
plt.ylim((40.5,40.92))
plt.xlabel('Longitude', size=20)
plt.ylabel('Latitude', size=20)
plt.savefig('NYCaccidents.png')
plt.show()
In [85]:
## What are the other things we need to look at now regarding traffic?
## Should we condition it on population data
## How much is population density and how much is population movement?
In [99]:
## SECOND ATTEMPT 
# I realized that I could just print out the statements using string interpolation (DUH)
# And then copy paste those statements below (much faster)
all_boroughs
Out[99]:
array([None, 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'BRONX', 'STATEN ISLAND'],
      dtype=object)
In [107]:
## CREATE SUBSET STATEMENTS
for b in all_boroughs:
    if b != None:
        subset = "= collisions_pd[collisions_pd['Borough'] == '"+b+"']"
        print(b.lower().replace(' ',''), subset)
brooklyn = collisions_pd[collisions_pd['Borough'] == 'BROOKLYN']
manhattan = collisions_pd[collisions_pd['Borough'] == 'MANHATTAN']
queens = collisions_pd[collisions_pd['Borough'] == 'QUEENS']
bronx = collisions_pd[collisions_pd['Borough'] == 'BRONX']
statenisland = collisions_pd[collisions_pd['Borough'] == 'STATEN ISLAND']
In [116]:
## CREATE SCATTER PLOT STATEMENTS
colors = "white blue black red green yellow"
colors = colors.split()

for borough, c in zip(all_boroughs, colors):
    if borough != None:
        b =  borough.lower().replace(' ', '')
        statement = 'plt.scatter('+b+'.Longitude,'+b+'.Latitude, s=1, color="'+c+'", marker=".")'
        print(statement)
plt.scatter(brooklyn.Longitude,brooklyn.Latitude, s=1, color="blue", marker=".")
plt.scatter(manhattan.Longitude,manhattan.Latitude, s=1, color="black", marker=".")
plt.scatter(queens.Longitude,queens.Latitude, s=1, color="red", marker=".")
plt.scatter(bronx.Longitude,bronx.Latitude, s=1, color="green", marker=".")
plt.scatter(statenisland.Longitude,statenisland.Latitude, s=1, color="yellow", marker=".")
In [117]:
## CREATE LEGEND STATEMENTS
for borough, c in zip(all_boroughs, colors):
    if borough != None:
        b =  borough.title()
        statement = c+"_patch = mpatches.Patch(color='"+c+"', label='"+b+"')"
        print(statement)
blue_patch = mpatches.Patch(color='blue', label='Brooklyn')
black_patch = mpatches.Patch(color='black', label='Manhattan')
red_patch = mpatches.Patch(color='red', label='Queens')
green_patch = mpatches.Patch(color='green', label='Bronx')
yellow_patch = mpatches.Patch(color='yellow', label='Staten Island')
In [120]:
plt.figure(figsize=(20,20))
brooklyn = collisions_pd[collisions_pd['Borough'] == 'BROOKLYN']
manhattan = collisions_pd[collisions_pd['Borough'] == 'MANHATTAN']
queens = collisions_pd[collisions_pd['Borough'] == 'QUEENS']
bronx = collisions_pd[collisions_pd['Borough'] == 'BRONX']
statenisland = collisions_pd[collisions_pd['Borough'] == 'STATEN ISLAND']
                             
plt.scatter(brooklyn.Longitude,brooklyn.Latitude, s=1, color="blue", marker=".")
plt.scatter(manhattan.Longitude,manhattan.Latitude, s=1, color="black", marker=".")
plt.scatter(queens.Longitude,queens.Latitude, s=1, color="red", marker=".")
plt.scatter(bronx.Longitude,bronx.Latitude, s=1, color="green", marker=".")
plt.scatter(statenisland.Longitude,statenisland.Latitude, s=1, color="yellow", marker=".")

blue_patch = mpatches.Patch(color='blue', label='Brooklyn')
black_patch = mpatches.Patch(color='black', label='Manhattan')
red_patch = mpatches.Patch(color='red', label='Queens')
green_patch = mpatches.Patch(color='green', label='Bronx')
yellow_patch = mpatches.Patch(color='yellow', label='Staten Island')

plt.legend([blue_patch, black_patch, red_patch, green_patch, yellow_patch], ([b.title() for b in all_boroughs if b != None]), 
           loc='upper left', prop={'size':20})

plt.title('Severity of Motor Vehicle Collisions by NYC Borough', size=20)
plt.xlim((-74.26,-73.7))
plt.ylim((40.5,40.92))
plt.xlabel('Longitude', size=20)
plt.ylabel('Latitude', size=20)
plt.savefig('NYCaccidentsByBorough.png')
plt.show()
In [ ]:
 
In [ ]: