Heroin Overdose Analysis of Cincinnati, Ohio

MUSA 620 Final Project

Shuyan Hong & Jiazuo Zhang

1. Exploratory Analysis

In [1]:
import numpy as np
import pandas as pd
import geopandas as gpd
import folium
from matplotlib import pyplot as plt
import cartopy.crs as ccrs

Import overdose data.

In [2]:
#Overdose data
query = ("https://data.cincinnati-oh.gov/resource/7mtn-nnb5.json?$limit=10000")
raw_data = pd.read_json(query)
raw_data.head()
Out[2]:
address_x agency arrival_time_primary_unit beat closed_time_incident community_council_neighborhood create_time_incident dispatch_time_primary_unit disposition_text district event_number incident_type_desc incident_type_id latitude_x longitude_x priority priority_color
0 W 6TH ST / RACE ST CPD NaN P011 2018-08-18T23:13:00.000 DOWNTOWN 2018-08-18T23:04:31.000 NaN HBF: HANDLED BY FIRE 1.0 CPD180818001582 NaN HEROINP-COMBINED 39.101675 -84.514038 2.0 NaN
1 8XX POPLAR ST CPD 2018-08-18T20:24:30.000 P131 2018-08-18T20:36:04.000 WEST END 2018-08-18T20:19:46.000 2018-08-18T20:21:50.000 HBF: HANDLED BY FIRE 1.0 CPD180818001335 NaN HEROINP-COMBINED 39.115051 -84.529110 2.0 NaN
2 47XX KENARD AV CPD 2018-08-18T19:03:33.000 P531 2018-08-18T21:48:01.000 SPRING GROVE VILLAGE 2018-08-18T19:03:33.000 2018-08-18T19:03:33.000 ARR: ARREST 5.0 CPD180818001232 NaN HEROINP-COMBINED 39.164528 -84.513777 2.0 NaN
3 4XX VOLKERT PL CPD NaN P511 2018-08-18T18:18:34.000 CUF 2018-08-18T18:00:54.000 NaN HBF: HANDLED BY FIRE 5.0 CPD180818001142 NaN HEROINP-COMBINED 39.126231 -84.525635 2.0 NaN
4 42XX FERGUS ST CPD 2018-08-18T16:22:25.000 P531 2018-08-18T16:55:58.000 NORTHSIDE 2018-08-18T16:20:51.000 2018-08-18T16:22:18.000 AST: ASSIST 5.0 CPD180818000986 NaN HEROINP-COMBINED 39.164508 -84.535824 2.0 NaN

First, we map overdoses from recent years using Folium.

In [3]:
df0 = raw_data[['latitude_x','longitude_x','incident_type_id']]
df0 = df0.dropna()

# every marker gets the same color
df0['color'] = 'red'
In [4]:
# Folium map of Cincinnati
m1 = folium.Map(
    width=700,height=500,
    location=[39.15, -84.55],
    zoom_start=11,
    tiles="Stamen Toner"
)

cityBoundary = 'https://opendata.arcgis.com/datasets/ed78f4754b044ac5815d0a9efe9bb336_1.geojson'
folium.GeoJson(cityBoundary).add_to(m1)


# cluster heroin overdose points; popups show the incident type
import folium.plugins
marker_cluster = folium.plugins.MarkerCluster().add_to(m1)

for i in range(0, len(df0)):
    folium.Marker(
        location=[df0.iloc[i]['latitude_x'], df0.iloc[i]['longitude_x']],
        popup=df0.iloc[i]['incident_type_id'],
        icon=folium.Icon(color=df0.iloc[i]['color'])
    ).add_to(marker_cluster)
m1
Out[4]:
In [5]:
from folium.plugins import HeatMap
# make a NumPy array (use the "values" attribute)
coordinates = df0[['latitude_x', 'longitude_x']].values

m2 = folium.Map(
    location=[39.15, -84.55],
    zoom_start=11,
    tiles="Stamen Toner"
)
HeatMap(coordinates).add_to(m2)
m2
Out[5]:

We can also create a hex bin map showing the number of overdoses inside each hex bin.

In [6]:
data_valid = raw_data.loc[raw_data.latitude_x.notnull()]
data_valid = data_valid.loc[data_valid.longitude_x.notnull()]
#ignore null longs or lats

from datashader.utils import lnglat_to_meters as webm

# convert lon/lat to Web Mercator x/y (meters)
data_valid.loc[:, "longitude_x"], data_valid.loc[:, "latitude_x"] = webm(data_valid["longitude_x"], data_valid["latitude_x"])


crs = ccrs.epsg('3857') # web mercator
ax = plt.axes()

# plot the hexbins (coordinates are already in Web Mercator)
vals = ax.hexbin(data_valid.longitude_x, data_valid.latitude_x, gridsize=50, mincnt=1, cmap='coolwarm')

ax.set_aspect('equal')

# larger figure
ax.figure.set_size_inches((12, 12))

# add a legend
#and make colorbar fit the size of figure
from mpl_toolkits.axes_grid1 import make_axes_locatable
divider = make_axes_locatable(ax)
cax = divider.append_axes("right", size="5%", pad=0.05)
cbar = plt.colorbar(vals,cax=cax)
cbar.ax.tick_params(labelsize=10)

2. Regression Analysis

To explore which factors might affect the number of overdoses in each tract, we fit a linear regression model.

2.1 Predictors

In [7]:
#!pip install cenpy
#!pip install pysal
import cenpy as cen
import pysal
import requests

Now we read data for some predictors. The data includes 311 requests, black and white population percentages, median income, bachelor's degree percentage, crime level, distance to vacant properties, and distances to transit stations and hospitals.

We use the census package to acquire census tract data for Cincinnati, following the instructions at https://github.com/datamade/census

In [8]:
#!pip install us
from census import Census
from us import states
MY_API_KEY = '97e85eda8a09ad9759d9cbc91a8d3f9eb71b9c0c'
c = Census(MY_API_KEY)
In [9]:
# population of all tracts of Hamilton County, from acs5 
population_raw = c.acs5.state_county_tract('B01003_001E', states.OH.fips, '061', Census.ALL)
population = pd.DataFrame(population_raw)
population = population.rename({'B01003_001E':'population'}, axis=1)

#  bachelor number of all tracts of Hamilton County, from acs5 
bachelor_raw = c.acs5.state_county_tract('B15003_022E', states.OH.fips, '061', Census.ALL)
bachelor = pd.DataFrame(bachelor_raw)
bachelor = bachelor.rename({'B15003_022E':'bachelor'}, axis=1)

#  black population of all tracts of Hamilton County, from acs5 
black_pop_raw = c.acs5.state_county_tract('B02001_003E', states.OH.fips, '061', Census.ALL)
black_pop = pd.DataFrame(black_pop_raw)
black_pop = black_pop.rename({'B02001_003E':'black_pop'}, axis=1)

#  white population of all tracts of Hamilton County, from acs5 
white_pop_raw = c.acs5.state_county_tract('B02001_002E', states.OH.fips, '061', Census.ALL)
white_pop = pd.DataFrame(white_pop_raw)
white_pop = white_pop.rename({'B02001_002E':'white_pop'}, axis=1)

#  median household income of all tracts of Hamilton County, from acs5 
median_hh_raw = c.acs5.state_county_tract('B19013_001E', states.OH.fips, '061', Census.ALL)
median_income = pd.DataFrame(median_hh_raw)
median_income = median_income.rename({'B19013_001E':'median_income'}, axis=1)
In [10]:
df1 = pd.merge(population, bachelor, on='tract', how='outer')
df2 = pd.merge(df1, black_pop, on='tract', how='outer')
df3 = pd.merge(df2, white_pop, on='tract', how='outer')
df4 = pd.merge(df3, median_income, on='tract', how='outer')
In [11]:
# calculate black, white, and bachelor percentages
demographic = df4[['tract','population','bachelor','black_pop','white_pop','median_income']].copy()
demographic['black_pct'] = demographic['black_pop']/demographic['population']
demographic['white_pct'] = demographic['white_pop']/demographic['population']
demographic['bachelor_pct'] = demographic['bachelor']/demographic['population']

We want to see how many overdose records fall inside each census tract over these years. The census tract data contains tracts beyond the city limit, so we keep only the tracts within it.

In [12]:
# intersect city boundary and all census tracts to get census tracts within city boundary
boundary = gpd.read_file(cityBoundary)
cts_raw = gpd.read_file('https://opendata.arcgis.com/datasets/d02e89ac4bfd47caa46c93a056dcbab9_0.geojson')
cts = gpd.overlay(boundary, cts_raw, how='intersection')
In [13]:
# select demographic data of census tracts within city boundary
cts = cts[['TRACTCE10','geometry']]
cts = cts.rename({'TRACTCE10':'tract'}, axis=1)
cts_demographic = pd.merge(cts, demographic, on='tract', how='inner')
In [14]:
cts_demographic.head()
Out[14]:
tract geometry population bachelor black_pop white_pop median_income black_pct white_pct bachelor_pct
0 025104 (POLYGON ((-84.41357452291133 39.0585997697975... 1862.0 585.0 19.0 1749.0 166250.0 0.010204 0.939313 0.314178
1 025002 (POLYGON ((-84.36969552616698 39.0737372659736... 6823.0 1539.0 136.0 6495.0 88979.0 0.019933 0.951927 0.225561
2 004500 (POLYGON ((-84.40989906360086 39.0833943746703... 1034.0 170.0 12.0 997.0 92167.0 0.011605 0.964217 0.164410
3 004603 POLYGON ((-84.37303856194467 39.08145590495828... 3019.0 706.0 84.0 2746.0 66556.0 0.027824 0.909573 0.233852
4 004602 (POLYGON ((-84.37086892135163 39.0859844464269... 4832.0 1039.0 200.0 3919.0 49063.0 0.041391 0.811051 0.215025

Then we count the number of hospitals and schools in each tract.

In [15]:
cts1 = cts_demographic

# read hospital geojson
hospitals = gpd.read_file('https://opendata.arcgis.com/datasets/0baa0cdf409e46e1800a6f08081bd9f7_1.geojson')
hospitals.crs = {'init' :'epsg:4326'}

# read school geojson
schools = gpd.read_file('https://opendata.arcgis.com/datasets/97194decd96d451cb9f67c19078b80c9_6.geojson')
schools.crs = {'init' :'epsg:4326'}

iv_dfsjoin1 = gpd.sjoin(cts1,hospitals) # spatial join census tracts and hospitals
hospital_count = iv_dfsjoin1.groupby('tract').count()[['GLOBALID']]
hospital_count = hospital_count.rename({'GLOBALID':'hospital_count'}, axis=1)

iv_dfsjoin2 = gpd.sjoin(cts1,schools) # spatial join census tracts and schools
school_count = iv_dfsjoin2.groupby('tract').count()[['GLOBALID']]
school_count = school_count.rename({'GLOBALID':'school_count'}, axis=1)

2.2 Dependent Variable

In [16]:
new_cts = cts_demographic
In [17]:
# create point geometries from df0
from shapely.geometry import Point
points = gpd.GeoDataFrame(df0, geometry=[Point(x, y) for x, y in zip(df0.longitude_x, df0.latitude_x)])

points.head()
Out[17]:
latitude_x longitude_x incident_type_id color geometry
0 39.101675 -84.514038 HEROINP-COMBINED red POINT (-84.514038 39.1016750001196)
1 39.115051 -84.529110 HEROINP-COMBINED red POINT (-84.52911 39.1150510001195)
2 39.164528 -84.513777 HEROINP-COMBINED red POINT (-84.513777 39.1645280001195)
3 39.126231 -84.525635 HEROINP-COMBINED red POINT (-84.52563499999999 39.1262310001196)
4 39.164508 -84.535824 HEROINP-COMBINED red POINT (-84.53582400000001 39.1645080001195)
In [18]:
points.crs = {'init' :'epsg:4326'}
dv_dfsjoin = gpd.sjoin(new_cts,points) # spatial join census tracts and overdose points
In [19]:
# count overdose points inside each tract
od_count = dv_dfsjoin.groupby('tract').count()[['index_right']]
od_count = od_count.rename({'index_right':'od_count'}, axis=1)
od_count.head()
Out[19]:
od_count
tract
000200 18
000700 268
000900 145
001000 55
001100 86

Now we merge the dependent and independent variables and calculate the overdose rate as od_count divided by the population of each tract.

In [20]:
merge1 = pd.merge(cts_demographic, hospital_count, on='tract', how='outer')
merge2 = pd.merge(merge1, school_count, on='tract', how='outer')
final_df = pd.merge(merge2, od_count, on='tract', how='outer')
final_df['od_rate'] = final_df['od_count']/final_df['population']
final_df = final_df.fillna(0)
In [21]:
final_df
Out[21]:
tract geometry population bachelor black_pop white_pop median_income black_pct white_pct bachelor_pct hospital_count school_count od_count od_rate
0 025104 (POLYGON ((-84.41357452291133 39.0585997697975... 1862.0 585.0 19.0 1749.0 166250.0 0.010204 0.939313 0.314178 0.0 0.0 0.0 0.000000
1 025002 (POLYGON ((-84.36969552616698 39.0737372659736... 6823.0 1539.0 136.0 6495.0 88979.0 0.019933 0.951927 0.225561 0.0 0.0 0.0 0.000000
2 004500 (POLYGON ((-84.40989906360086 39.0833943746703... 1034.0 170.0 12.0 997.0 92167.0 0.011605 0.964217 0.164410 0.0 0.0 4.0 0.003868
3 004603 POLYGON ((-84.37303856194467 39.08145590495828... 3019.0 706.0 84.0 2746.0 66556.0 0.027824 0.909573 0.233852 0.0 1.0 6.0 0.001987
4 004602 (POLYGON ((-84.37086892135163 39.0859844464269... 4832.0 1039.0 200.0 3919.0 49063.0 0.041391 0.811051 0.215025 0.0 3.0 45.0 0.009313
5 025001 (POLYGON ((-84.36957073546399 39.0749964865464... 6260.0 1548.0 62.0 6028.0 82868.0 0.009904 0.962939 0.247284 0.0 0.0 0.0 0.000000
6 004605 (POLYGON ((-84.41169303481355 39.1008944955807... 2516.0 492.0 237.0 2095.0 58650.0 0.094197 0.832671 0.195548 0.0 0.0 22.0 0.008744
7 004604 POLYGON ((-84.39448364121984 39.10524757245881... 3887.0 481.0 288.0 3329.0 51431.0 0.074093 0.856445 0.123746 0.0 0.0 62.0 0.015951
8 004702 POLYGON ((-84.40476674756641 39.13493087559406... 780.0 83.0 13.0 752.0 38646.0 0.016667 0.964103 0.106410 0.0 0.0 57.0 0.073077
9 026600 POLYGON ((-84.46571383669516 39.12193007521013... 1608.0 432.0 141.0 1404.0 70250.0 0.087687 0.873134 0.268657 0.0 1.0 82.0 0.050995
10 004701 POLYGON ((-84.42808849604683 39.12703657693825... 3196.0 1101.0 53.0 3077.0 95488.0 0.016583 0.962766 0.344493 0.0 2.0 6.0 0.001877
11 004200 POLYGON ((-84.46444366483048 39.12983595709321... 1836.0 518.0 424.0 1300.0 53056.0 0.230937 0.708061 0.282135 0.0 1.0 11.0 0.005991
12 004900 POLYGON ((-84.4389178289271 39.11773602336285,... 6487.0 1859.0 269.0 5818.0 94868.0 0.041468 0.896871 0.286573 0.0 2.0 13.0 0.002004
13 024901 (POLYGON ((-84.3970721740594 39.12196931007487... 1075.0 122.0 0.0 1065.0 57500.0 0.000000 0.990698 0.113488 0.0 0.0 0.0 0.000000
14 004800 POLYGON ((-84.40569287293694 39.13825103319869... 3426.0 1054.0 9.0 3196.0 119470.0 0.002627 0.932866 0.307647 0.0 1.0 3.0 0.000876
15 004100 POLYGON ((-84.45843631676497 39.1343519865754,... 1615.0 381.0 728.0 800.0 59643.0 0.450774 0.495356 0.235913 0.0 4.0 9.0 0.005573
16 003900 POLYGON ((-84.4724115280404 39.13872552367319,... 1951.0 108.0 1593.0 350.0 22566.0 0.816504 0.179395 0.055356 0.0 0.0 9.0 0.004613
17 005100 POLYGON ((-84.40851084594802 39.14021088964427... 2410.0 626.0 79.0 2258.0 94145.0 0.032780 0.936929 0.259751 0.0 1.0 3.0 0.001245
18 004000 POLYGON ((-84.46725090467903 39.14695878206831... 2372.0 320.0 1546.0 648.0 42949.0 0.651771 0.273187 0.134907 0.0 1.0 10.0 0.004216
19 024700 (POLYGON ((-84.40220717113631 39.1347695617024... 1707.0 263.0 59.0 1615.0 52297.0 0.034564 0.946104 0.154071 0.0 0.0 0.0 0.000000
20 003800 POLYGON ((-84.47189734296572 39.14697704190844... 1940.0 140.0 1753.0 181.0 22788.0 0.903608 0.093299 0.072165 0.0 2.0 26.0 0.013402
21 005000 (POLYGON ((-84.45444775941291 39.1459729946670... 5049.0 1406.0 339.0 4227.0 69000.0 0.067142 0.837195 0.278471 0.0 3.0 13.0 0.002575
22 024800 POLYGON ((-84.38558243035754 39.14844630157257... 3404.0 907.0 5.0 3234.0 96106.0 0.001469 0.950059 0.266451 0.0 0.0 0.0 0.000000
23 005302 POLYGON ((-84.40451981774257 39.14973722544627... 3249.0 923.0 304.0 2720.0 66250.0 0.093567 0.837181 0.284087 0.0 0.0 11.0 0.003386
24 005301 POLYGON ((-84.42516472407397 39.15575951288675... 3587.0 1071.0 475.0 2974.0 56563.0 0.132423 0.829105 0.298578 0.0 2.0 10.0 0.002788
25 025402 (POLYGON ((-84.44034958036744 39.1557611335698... 3052.0 580.0 265.0 2693.0 60741.0 0.086828 0.882372 0.190039 0.0 0.0 0.0 0.000000
26 005200 POLYGON ((-84.44279399384786 39.14528821560646... 3535.0 1084.0 196.0 3101.0 69318.0 0.055446 0.877228 0.306648 0.0 1.0 10.0 0.002829
27 025600 (POLYGON ((-84.45944560213331 39.1464920393789... 3523.0 574.0 311.0 3129.0 44500.0 0.088277 0.888163 0.162929 0.0 0.0 0.0 0.000000
28 005600 POLYGON ((-84.37905156837199 39.16008944046915... 5790.0 951.0 1928.0 3575.0 46105.0 0.332988 0.617444 0.164249 0.0 2.0 20.0 0.003454
29 025401 POLYGON ((-84.43934752457294 39.15653210139998... 1986.0 385.0 129.0 1824.0 43402.0 0.064955 0.918429 0.193857 0.0 0.0 0.0 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
120 020901 (POLYGON ((-84.62305139318956 39.1459676064429... 3539.0 298.0 299.0 3020.0 45556.0 0.084487 0.853348 0.084205 0.0 0.0 2.0 0.000565
121 006500 POLYGON ((-84.46604105823579 39.16735990757773... 5894.0 445.0 2953.0 2764.0 46462.0 0.501018 0.468951 0.075501 0.0 2.0 54.0 0.009162
122 008502 POLYGON ((-84.55265169991918 39.16421849726655... 2094.0 30.0 1819.0 143.0 7044.0 0.868672 0.068290 0.014327 0.0 1.0 7.0 0.003343
123 020902 (POLYGON ((-84.60275325565836 39.1618617903557... 5548.0 603.0 876.0 4548.0 40176.0 0.157895 0.819755 0.108688 0.0 0.0 0.0 0.000000
124 010100 POLYGON ((-84.60815706077847 39.15113837434231... 5359.0 559.0 2946.0 2185.0 37884.0 0.549729 0.407725 0.104311 0.0 1.0 80.0 0.014928
125 007800 POLYGON ((-84.54501968760005 39.16714455545117... 2538.0 459.0 757.0 1639.0 38571.0 0.298266 0.645784 0.180851 0.0 2.0 58.0 0.022853
126 007400 POLYGON ((-84.52894142414301 39.16157064925422... 1716.0 373.0 281.0 1350.0 48750.0 0.163753 0.786713 0.217366 0.0 0.0 49.0 0.028555
127 007900 POLYGON ((-84.54346057775746 39.17476733751921... 1697.0 285.0 819.0 848.0 33083.0 0.482616 0.499705 0.167943 0.0 0.0 29.0 0.017089
128 025800 (POLYGON ((-84.48322645115884 39.1687728134858... 4366.0 421.0 773.0 3406.0 51029.0 0.177050 0.780119 0.096427 0.0 0.0 0.0 0.000000
129 020802 (POLYGON ((-84.58681466060455 39.1642494579820... 5310.0 970.0 461.0 4596.0 67083.0 0.086817 0.865537 0.182674 0.0 0.0 0.0 0.000000
130 008501 POLYGON ((-84.58258361702951 39.16988658576086... 3061.0 163.0 2202.0 773.0 18326.0 0.719373 0.252532 0.053251 0.0 0.0 43.0 0.014048
131 007500 POLYGON ((-84.54216972568472 39.18434683094865... 2121.0 359.0 498.0 1557.0 47647.0 0.234795 0.734088 0.169260 0.0 1.0 13.0 0.006129
132 006400 POLYGON ((-84.48527940902325 39.1727951596445,... 3382.0 272.0 3112.0 195.0 30748.0 0.920166 0.057658 0.080426 0.0 4.0 29.0 0.008575
133 007300 POLYGON ((-84.50562540689216 39.17536341551688... 1967.0 208.0 964.0 918.0 43450.0 0.490086 0.466701 0.105745 0.0 3.0 83.0 0.042196
134 020811 (POLYGON ((-84.57191468719508 39.1851453068084... 6826.0 840.0 1867.0 4603.0 64904.0 0.273513 0.674333 0.123059 0.0 0.0 11.0 0.001611
135 025700 (POLYGON ((-84.48190898514163 39.1888039171248... 1965.0 66.0 373.0 1405.0 30089.0 0.189822 0.715013 0.033588 0.0 0.0 0.0 0.000000
136 008400 POLYGON ((-84.54611722737476 39.18881689934389... 2187.0 196.0 1326.0 812.0 31875.0 0.606310 0.371285 0.089620 0.0 2.0 15.0 0.006859
137 008000 POLYGON ((-84.52089703660116 39.20122255647082... 5204.0 157.0 4260.0 643.0 11600.0 0.818601 0.123559 0.030169 0.0 1.0 21.0 0.004035
138 008100 POLYGON ((-84.53468716564535 39.20223882375489... 2752.0 210.0 2119.0 567.0 30526.0 0.769985 0.206032 0.076308 0.0 1.0 14.0 0.005087
139 006100 POLYGON ((-84.49410091979443 39.20130140101698... 2567.0 137.0 564.0 1555.0 31726.0 0.219712 0.605765 0.053370 0.0 1.0 153.0 0.059603
140 008202 POLYGON ((-84.55883515146103 39.20364065673207... 3279.0 474.0 2087.0 1120.0 44635.0 0.636475 0.341568 0.144556 0.0 3.0 20.0 0.006099
141 008300 POLYGON ((-84.58037897116463 39.19253946427716... 4178.0 429.0 2323.0 1699.0 45969.0 0.556008 0.406654 0.102681 2.0 1.0 52.0 0.012446
142 008201 POLYGON ((-84.55763442709959 39.20357901006025... 4095.0 479.0 2472.0 1237.0 42595.0 0.603663 0.302076 0.116972 0.0 2.0 18.0 0.004396
143 021900 (POLYGON ((-84.55760923423578 39.2038488125994... 1583.0 117.0 1054.0 523.0 33846.0 0.665824 0.330385 0.073910 0.0 0.0 0.0 0.000000
144 022200 (POLYGON ((-84.47438418424967 39.2098562541126... 4481.0 857.0 1110.0 3320.0 56321.0 0.247713 0.740906 0.191252 0.0 0.0 0.0 0.000000
145 011100 (POLYGON ((-84.54135362260716 39.2126544630221... 3357.0 432.0 1716.0 1505.0 56620.0 0.511171 0.448317 0.128686 0.0 0.0 7.0 0.002085
146 021802 (POLYGON ((-84.55104971477819 39.2093521842509... 2917.0 293.0 1273.0 1369.0 51071.0 0.436407 0.469318 0.100446 0.0 0.0 0.0 0.000000
147 021801 (POLYGON ((-84.53274294648506 39.2169644639702... 6405.0 676.0 3476.0 2764.0 42371.0 0.542701 0.431538 0.105543 0.0 0.0 0.0 0.000000
148 022102 POLYGON ((-84.53107073060511 39.21680448689365... 6626.0 871.0 2120.0 3656.0 64831.0 0.319952 0.551766 0.131452 0.0 0.0 0.0 0.000000
149 022601 (POLYGON ((-84.47363437549781 39.2176530190985... 6170.0 1191.0 664.0 4995.0 133468.0 0.107618 0.809562 0.193031 0.0 0.0 0.0 0.000000

150 rows × 14 columns

2.3 Regression

First, we create some interactive maps showing the predictors and overdose rates using hvplot.

In [22]:
import hvplot.pandas
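
The interactive maps might look like this (a minimal sketch: it assumes final_df, built in 2.2 above, still carries its tract geometries, and that hvplot's geopandas support is available):

In [ ]:
# wrap final_df back into a GeoDataFrame so hvplot draws tract polygons
gdf = gpd.GeoDataFrame(final_df)

# interactive choropleth of the overdose rate by tract
gdf.hvplot(c='od_rate', cmap='coolwarm', width=600, height=400,
           title='Overdose Rate (2015-2018)')

# and one of the predictors for comparison, e.g. median household income
gdf.hvplot(c='median_income', cmap='viridis', width=600, height=400,
           title='Median Household Income')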

Now we run the linear regression.

First, we look at the relationship between single predictors and the overdose rate.
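
A single-predictor check might be sketched like this (median_income is just one example column; any predictor in final_df works):

In [ ]:
# scatter of one predictor against the overdose rate
final_df.hvplot.scatter(x='median_income', y='od_rate',
                        title='Median Income vs. Overdose Rate')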

Then we run the multivariable linear regression, starting with the correlation matrix of the predictors.
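
The correlation matrix can be computed directly with pandas; this sketch uses our predictor columns:

In [ ]:
predictors = ['black_pct', 'white_pct', 'bachelor_pct',
              'median_income', 'hospital_count', 'school_count']
final_df[predictors].corr()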

It is not surprising that median income, percent white, and percent with bachelor's degrees are strongly associated.

As we can see from the summary, the overdose rate is positively related to percent black, percent white, number of hospitals, and number of schools, but negatively associated with median household income.
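
The summary referred to above might be produced with statsmodels, roughly as follows (a sketch, not necessarily the exact specification we ran):

In [ ]:
import statsmodels.formula.api as smf

model = smf.ols('od_rate ~ black_pct + white_pct + bachelor_pct + '
                'median_income + hospital_count + school_count',
                data=final_df).fit()
print(model.summary())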

3. Clustering Analysis

We look at how overdoses across tracts relate spatially: whether they are clustered, dispersed, or randomly distributed.

We will create maps that not only show the clustering but also offer interactive tooltips with detailed information for the selected tract, such as population, median household income, and bachelor count.

In [31]:
import altair as alt
from vega_datasets import data
alt.renderers.enable("notebook")
Out[31]:
RendererRegistry.enable('notebook')
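
The interactive tract map described above might be built with altair's mark_geoshape, along these lines (a sketch; it assumes final_df still carries tract geometries):

In [ ]:
import json

# convert the tracts to GeoJSON features so altair can read the attributes
features = json.loads(gpd.GeoDataFrame(final_df).to_json())['features']

alt.Chart(alt.Data(values=features)).mark_geoshape(
    stroke='white', strokeWidth=0.5
).encode(
    color=alt.Color('properties.od_rate:Q', title='Overdose rate'),
    tooltip=['properties.population:Q',
             'properties.median_income:Q',
             'properties.bachelor:Q'],
).properties(width=500, height=400)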
In [32]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
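
KMeans is imported above; one way to use it is to group tracts on scaled attributes (a sketch with a hypothetical feature choice):

In [ ]:
features = final_df[['od_rate', 'median_income', 'black_pct', 'bachelor_pct']]
scaled = scaler.fit_transform(features)

# five clusters is an arbitrary starting point
kmeans = KMeans(n_clusters=5, random_state=42)
final_df['kmeans_label'] = kmeans.fit_predict(scaled)
final_df.groupby('kmeans_label').size()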

DBSCAN

In [33]:
from sklearn.cluster import dbscan 
In [34]:
data2 = raw_data[['create_time_incident','latitude_x', 'longitude_x']]
data2 = data2.dropna()
In [35]:
# some parameters to start with
eps = 200 # in meters, maximum distance between two samples for one to be considered as in the neighborhood of the other
min_samples = 5 # number of samples (or total weight) in a neighborhood for a point to be considered as a core point

from datashader.utils import lnglat_to_meters as webm

# convert lon/lat to Web Mercator x/y (eps above is in meters)
data2.loc[:, "longitude_x"], data2.loc[:, "latitude_x"] = webm(data2["longitude_x"], data2["latitude_x"])

# run DBSCAN on the projected coordinates
cores, labels = dbscan(data2[['longitude_x', 'latitude_x']], 
                       eps=eps, min_samples=min_samples)

# add our labels to the original data
data2['label'] = labels
# The number of clusters is the number of unique labels minus one (because noise has a label of -1)
num_clusters = data2['label'].nunique() - 1
print("number of clusters = ", num_clusters)
number of clusters =  140
In [36]:
print(len(cores))
4593
In [37]:
# Setup figure and axis
f, ax = plt.subplots(1, figsize=(10, 7), facecolor='black')

# Plot the noise samples in grey
noise = data2.loc[data2['label']==-1]
ax.scatter(noise['longitude_x'], noise['latitude_x'], c='grey', s=5, linewidth=0)

# loop over each cluster number
for label_num in range(num_clusters):
    
    # extract the samples with this label number
    this_cluster = data2.loc[data2['label'] == label_num]
    
    # plot the points in this cluster
    ax.scatter(this_cluster['longitude_x'], this_cluster['latitude_x'], 
               linewidth=0, color='red')

# format and show
ax.set_axis_off()
plt.show()
In [38]:
N = data2.groupby('label').size()


# sort from largest to smallest
N = N.sort_values(ascending=False)
print(N)

# extract labels (ignoring label -1 for noise)
N2 = N.drop(labels=[-1])
top5 = list(N2.iloc[0:5].index)
print(top5)
label
 0      1189
-1      1005
 4       984
 9       204
 36      197
 45      153
 5       125
 13      104
 30       75
 20       74
 8        71
 3        70
 2        64
 39       62
 22       58
 33       47
 92       46
 48       36
 67       34
 96       33
 21       33
 47       32
 1        32
 10       30
 11       29
 101      28
 38       28
 123      27
 19       27
 98       24
        ... 
 68        6
 106       6
 66        6
 132       6
 100       6
 107       6
 94        6
 112       6
 133       5
 131       5
 128       5
 135       5
 136       5
 130       5
 97        5
 126       5
 25        5
 120       5
 118       5
 111       5
 109       5
 99        5
 54        5
 55        5
 138       5
 78        5
 85        5
 102       5
 139       5
 119       4
Length: 141, dtype: int64
[0, 4, 9, 36, 45]

4. Spatio-temporal Analysis

We look at how overdoses in each tract change over the last three years (2016 to 2018) by creating a GIF image highlighting the concentrated tracts. The methods follow Homework 5.

4.1 Overdose by Month

In [40]:
import zipfile
import os
import urllib
import dask.dataframe as dd
import datashader as ds
import datashader.transfer_functions as tf

from datashader.colors import Greys9, inferno
In [41]:
# extract the month each incident occurred
data2['create_time_incident'] = pd.to_datetime(data2['create_time_incident'])
data2['month'] = data2.create_time_incident.dt.month
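
With the month column in place, per-month frames might be rendered with datashader along these lines (a sketch; data2 already holds Web Mercator coordinates from the DBSCAN step):

In [ ]:
frames = []
for month in range(1, 13):
    subset = data2.loc[data2['month'] == month]
    canvas = ds.Canvas(plot_width=400, plot_height=400)
    agg = canvas.points(subset, 'longitude_x', 'latitude_x')
    # histogram-equalized shading brings out sparse months
    frames.append(tf.set_background(tf.shade(agg, cmap=inferno, how='eq_hist'), 'black'))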

4.2 Overdose by Year

In [45]:
data3 = data2
data3['create_time_incident'] = pd.to_datetime(data3['create_time_incident'])
data3['year'] = data3.create_time_incident.dt.year
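
The yearly frames can then be stitched into the GIF (a sketch; it assumes imageio is installed and reuses the datashader rendering above):

In [ ]:
import imageio

gif_frames = []
for year in [2016, 2017, 2018]:
    subset = data3.loc[data3['year'] == year]
    canvas = ds.Canvas(plot_width=400, plot_height=400)
    agg = canvas.points(subset, 'longitude_x', 'latitude_x')
    img = tf.set_background(tf.shade(agg, cmap=inferno), 'black')
    gif_frames.append(np.array(img.to_pil()))

# one second per frame
imageio.mimsave('overdose_by_year.gif', gif_frames, duration=1.0)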

4.3 Overdose Trend

We can plot overdose counts by month and by date.

In [48]:
data4 = data3.groupby(['year','month']).count()[['create_time_incident']]
In [49]:
data4 = data4.rename({'create_time_incident':'count'}, axis=1)
data4 = data4.reset_index()
data4
Out[49]:
year month count
0 2015 7 37
1 2015 8 113
2 2015 9 96
3 2015 10 104
4 2015 11 108
5 2015 12 91
6 2016 1 88
7 2016 2 90
8 2016 3 82
9 2016 4 88
10 2016 5 112
11 2016 6 86
12 2016 7 124
13 2016 8 205
14 2016 9 305
15 2016 10 164
16 2016 11 86
17 2016 12 87
18 2017 1 133
19 2017 2 241
20 2017 3 333
21 2017 4 297
22 2017 5 283
23 2017 6 266
24 2017 7 204
25 2017 8 240
26 2017 9 229
27 2017 10 188
28 2017 11 187
29 2017 12 138
30 2018 1 143
31 2018 2 143
32 2018 3 141
33 2018 4 147
34 2018 5 127
35 2018 6 186
36 2018 7 242
37 2018 8 126
In [78]:
data4.hvplot(x='month',groupby='year',kind='bar')
Out[78]:
In [84]:
data5 = data3
data5['date'] = data3.create_time_incident.dt.date
data5 = data5.groupby(['year','month','date']).count()[['create_time_incident']]
data5 = data5.rename({'create_time_incident':'count'}, axis=1)
data5 = data5.reset_index()
data5.hvplot(x='date',y='count',kind='bar')
Out[84]:

5. The Online Visualization

In [51]:
import numpy as np
import pandas as pd
import geopandas as gpd
import requests

import holoviews as hv
import param as pm
import panel as pn

import altair as alt
import folium
from folium.plugins import HeatMap
In [52]:
hv.extension('bokeh')
pn.extension('vega')
alt.renderers.enable('notebook');
In [122]:
MAX = 12
MIN = 1

# store the first/last month in our data set
MIN_DATE = data2['month'].min()
MAX_DATE = data2['month'].max()

# store the default month bounds
DEFAULT_BOUNDS = (MIN_DATE, MAX_DATE)
In [123]:
data6 = raw_data[['create_time_incident','latitude_x', 'longitude_x']]
data6 = data6.dropna()
data6['create_time_incident'] = pd.to_datetime(data6['create_time_incident'])
data6['month'] = data6.create_time_incident.dt.month
In [130]:
class CincinnatiOverdoseApp(pm.Parameterized):
    """
    An app visualizing recent overdoses in Cincinnati. 
    
    It includes:
    - a Folium heatmap
    - a map showing overdose rates and demographic data
    - an hvplot line chart

    """
    
    # the month to get data for
    month = pm.Integer(default=7, bounds=(MIN, MAX))
    
    # the x-axis selection on the daily overdose chart
    timeFilter = hv.streams.BoundsX(boundsx=DEFAULT_BOUNDS)
    
    
    def filter_by_month(self):
        """
        Return the subset of the full data set ('data6') that 
        occurred in the selected month.
        """
        # select data based on month
        selection = data6[data6['month'] == self.month]  

        # only return subset of data that is necessary
        return selection
    
    @pm.depends("month", watch=True)
    def _reset_timeFilter(self):
        """
        Internal function that resets the time filter to the default limits
        """
        self.timeFilter.update(boundsx=DEFAULT_BOUNDS)
    

    @pm.depends("month")
    def heatmap(self):
        """
        Return a Folium map with a heatmap showing the currently 
        selected data.
        """
        # initialize the Folium map
        m3 = folium.Map(
                location=[39.15, -84.55],
                zoom_start=11,
                tiles="Stamen Toner"
        )

        # get the filtered data
        data = self.filter_by_month()

        # convert to coordinates array
        coordinates = data[['latitude_x', 'longitude_x']].values

        # add heat map
        folium.GeoJson(cityBoundary).add_to(m3)
        HeatMap(coordinates).add_to(m3)
        # IMPORTANT: add map to a folium Figure
        # return a figure with a set width/height
        figure = folium.Figure(width=700, height=400)
        m3.add_to(figure)

        # return the Pane object
        return pn.Pane(figure)
    
    
    def hvplotmap(self):
        figure = final_df.hvplot(c='od_rate', width=700, height=400, dynamic=False,  
                                 cmap='coolwarm' , title = 'Overdose Rate (2015-2018)')

        # return the Pane object
        return pn.Pane(figure)
    
    def barplot(self):
        figure = data5.hvplot(x='date',y='count',kind='line',width = 1400,title = 'Overdose Count by date')

        # return the Pane object
        return pn.Pane(figure)
    
  
In [131]:
app = CincinnatiOverdoseApp(name="")

# Create some HTML elements
title = pn.Pane("<h2>Heroin Overdose in Cincinnati</h2>", width=500)
instructions = pn.Pane(
    """
<div style="font-size:28px"><b>Note:</b> From the summer of 2015 to the summer of 2018, there were more than
6,000 heroin overdose reports in Cincinnati, Ohio. This page shows the situation and trends of heroin overdose in the city.</div>""",
    width=500,
)

panel = pn.Column(
    pn.Row(pn.Column(pn.Row(title))),
    pn.Row(pn.Row(app.param),instructions, align="center"),
    pn.Row(app.heatmap,app.hvplotmap, align="center"),
    pn.Row(app.barplot)
)

panel.servable()
Out[131]:
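
To publish the dashboard, the notebook can be served from the command line with Panel's CLI, e.g. panel serve final_project.ipynb (the filename here is a placeholder).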