import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# NOTE(review): the two lines below ran before `url` was ever defined and
# raised a NameError at import time; they duplicate the first statements of
# download_df() below, so they are disabled as leftover scratch code.
# page = requests.get(url)
# soup = bs(page.content, "html.parser")
# url = "https://dph.georgia.gov/covid-19-daily-status-report"
def download_df(url, filename='georgia_covid19.csv'):
    """Scrape the Georgia DPH county summary table at *url* and save it as CSV.

    Fetches the page, locates the tables under the element with id="summary",
    converts the second table to a DataFrame, prints a preview, and writes it
    to *filename*.

    Parameters
    ----------
    url : str
        Page to scrape (the Georgia DPH COVID-19 status-report mirror).
    filename : str, optional
        Output CSV path; default preserves the original hard-coded name.

    Returns
    -------
    pandas.DataFrame
        The scraped table (also written to disk).
    """
    page = requests.get(url)
    soup = bs(page.content, "html.parser")
    content_block = soup.find(id="summary").find_all('table')

    def table_data_text(table):
        # Flatten an HTML <table> into a list of rows (header row first,
        # when the table has <th> cells).
        rows = []
        trs = table.find_all('tr')
        headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')]
        if headerow:
            rows.append(headerow)
            trs = trs[1:]
        for tr in trs:
            rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
        return rows

    # The second table under #summary holds the per-county counts.
    list_table = table_data_text(content_block[1])
    dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
    print(dftable.head())
    dftable.to_csv(filename, index=False)
    return dftable
# Scrape the live mirror of the Georgia DPH county table and save the CSV.
url = "https://d20s4vd27d0hk0.cloudfront.net/"
download_df(url)
def get_timestamps(site):
    """Return Wayback Machine capture timestamps for *site*.

    Queries the web.archive.org CDX index and extracts the 14-digit capture
    timestamps that follow this site's CDX url-key.

    Parameters
    ----------
    site : str
        URL whose archive history to look up.

    Returns
    -------
    list[str]
        Capture timestamps in the order returned by the CDX API.
    """
    url = "http://web.archive.org/cdx/search/cdx?url=" + site
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    # Must customize this part per website: the split marker is the CDX
    # url-key of this specific domain.
    strings = html.split("cloudfront,d20s4vd27d0hk0)/ ")
    # BUG FIX: the original built strings_dt but never returned it, so the
    # caller (ts = get_timestamps(...)) always got None.
    strings_dt = [string.split(' ')[0] for string in strings[1:]]
    return strings_dt
def get_one_per_day(ts):
    """Keep only the first timestamp seen for each calendar day.

    Timestamps are 14-digit Wayback strings (YYYYMMDDhhmmss); the first
    eight characters identify the day.  First-seen order is preserved.
    """
    first_per_day = {}
    for stamp in ts:
        first_per_day.setdefault(stamp[:8], stamp)
    return list(first_per_day.values())
# BUG FIX: the original called get_one_per_day(strings_dt), but strings_dt is
# local to get_timestamps() and raised a NameError here; the function's
# return value `ts` is what was intended.
ts = get_timestamps("https://d20s4vd27d0hk0.cloudfront.net/")
one_per_day = get_one_per_day(ts)
# download_df(one_per_day)
# one_per_day
def download_df(one_per_day):
    """Download the archived Georgia DPH county table for each timestamp.

    For every Wayback timestamp in *one_per_day*, fetches the archived page,
    extracts the second table under id="summary", tags the rows with the
    timestamp, and writes one '<timestamp>_COVID19Georgia.csv' per capture.

    Parameters
    ----------
    one_per_day : list[str]
        14-digit Wayback timestamps, typically one per day.
    """
    # BUG FIX: the original contained a duplicated nested
    # `def download_df(one_per_day):` (so calling the outer function only
    # defined the inner one and did nothing) and a doubled `url = url = ...`
    # assignment; both removed.  The table helper is also hoisted out of the
    # loop instead of being redefined every iteration.

    def table_data_text(table):
        # Flatten an HTML <table> into a list of rows (header row first,
        # when the table has <th> cells).
        rows = []
        trs = table.find_all('tr')
        headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')]
        if headerow:
            rows.append(headerow)
            trs = trs[1:]
        for tr in trs:
            rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
        return rows

    for timestamp in one_per_day:
        url = ("https://web.archive.org/web/" + timestamp
               + "/https://d20s4vd27d0hk0.cloudfront.net/")
        page = requests.get(url)
        soup = bs(page.content, "html.parser")
        content_block = soup.find(id="summary").find_all('table')
        list_table = table_data_text(content_block[1])
        dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
        dftable['timestamp'] = timestamp
        filename = timestamp + "_COVID19Georgia.csv"
        print(dftable.head())
        dftable.to_csv(filename, index=False)
# url = "https://d20s4vd27d0hk0.cloudfront.net/"
# Fetch and save one CSV per archived day of the county table.
download_df(one_per_day)
# Combine the per-day CSVs into one frame and derive an ISO-style date column
# from the 14-digit Wayback timestamp.
import glob
import os

csv_paths = glob.glob(os.path.join('georgia', "*.csv"))
df = pd.concat(map(pd.read_csv, csv_paths))
df['timestamp'] = df['timestamp'].astype(str)
df['date'] = df.apply(
    lambda row: '-'.join((row['timestamp'][:4],
                          row['timestamp'][4:6],
                          row['timestamp'][6:8])),
    axis=1,
)
df.head()
df.columns = ['county', 'cases', 'deaths', 'ts', 'date']
## PREPPING DATA FOR OG D3 RACES
# Sum cases per (county, date), then pivot to one row per county with one
# column per date — the wide layout the D3 bar-chart race expects.
train_group = pd.DataFrame(df.groupby(['county', 'date'])['cases'].sum())
train_group.reset_index(inplace=True)
train_group.columns = ['name', 'date', 'value']
## GETTING ONLY MARCH DATA
# train_group = train_group[train_group['date'] >= '2020-03-01']
# BUG FIX: DataFrame.append() was deprecated and removed in pandas 2.0;
# collect the per-county pivots in a list and concatenate once (this also
# avoids quadratic re-copying of the growing frame).
pieces = []
for country in list(set(train_group['name'])):
    df = train_group[train_group['name'] == country]
    new_df = df.pivot_table('value', 'name', 'date')
    pieces.append(new_df)
reshaped_df = pd.concat(pieces) if pieces else pd.DataFrame()
reshaped_df.reset_index(inplace=True)
reshaped_df
reshaped_df.to_csv('all_georgia.csv', index=False)
df = pd.read_csv('all_georgia.csv')
from urllib.request import urlopen
import json
# County-boundary GeoJSON keyed by FIPS code, used by the choropleth plots.
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
# BUG FIX: counties['features'] is a list, so indexing it with the string ''
# raised a TypeError; inspect the first feature instead (exploratory peek).
counties['features'][0]
set(list(df['name']))
# Map Georgia county names to FIPS codes for the choropleth.
# (The original loaded the CSV and built this dict twice back to back;
# once is enough — the second copy was an exact duplicate.)
fips = pd.read_csv('county_fips.csv')
fips_ga = fips[fips['State'] == 'GA']
fips_ga_dict = dict(zip(fips_ga.Name, fips_ga.FIPS))
fips_ga_dict
def get_fips(county):
    """Return the FIPS code for *county*, or the string 'na' when unmapped.

    The 'na' sentinel keeps unmapped counties visible in the frame instead
    of raising while building the choropleth input.
    """
    # BUG FIX: catch only the missing-key case; the original bare `except`
    # also hid unrelated errors.
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'
df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
df.head()
# BUG FIX: this line was truncated to `df.columns[-2:]]` (a syntax error);
# the following lines use df_sm, so the intent was evidently to keep only the
# last two columns (the latest date column and the fips code).
df_sm = df[df.columns[-2:]]
df_sm.head()
df_sm.to_csv('georgia_latest.csv', index=False)
# Re-download the county GeoJSON and load Plotly's demo unemployment dataset
# (mirrors the Plotly choropleth example used as a sanity check below).
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
import pandas as pd
# Keep fips as strings so leading zeros survive and codes match the GeoJSON ids.
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
                 dtype={"fips": str})
import plotly.express as px
# Reference example from the Plotly docs: US counties coloured by the 2016
# unemployment rate.  Confirms the GeoJSON + fips wiring works before
# plotting the Georgia COVID data the same way.
fig = px.choropleth(df, geojson=counties, locations='fips', color='unemp',
                    color_continuous_scale="Viridis",
                    range_color=(0, 12),
                    scope="usa",
                    labels={'unemp': 'unemployment rate'}
                    )
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
fig.show()
import pandas as pd
# Plot the latest Georgia county counts for a single date.
g = pd.read_csv('georgia_latest.csv')
# df['2020-04-25'].max()
g.columns[0]
date = '2020-04-25'
# BUG FIX: the original passed `df` — still bound to the unemployment demo
# frame, which has no '2020-04-25' column — to px.choropleth; the freshly
# loaded Georgia frame `g` is what was intended.
fig = px.choropleth(g, geojson=counties, locations='fips', color=date,
                    color_continuous_scale="Viridis",
                    range_color=(0, 500),
                    scope="usa",
                    labels={'2020-04-25': 'infection rate'}
                    )
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, title="hello")
# Centre the date as the figure title.
fig.update_layout(
    title={
        'text': date,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()
df = pd.read_csv('all_georgia.csv')


def get_fips(county):
    """Return the FIPS code for *county*, or 'na' when the county is unmapped."""
    # BUG FIX: catch only the missing-key case; the original bare `except`
    # also hid unrelated errors.
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'


df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
def make_plot(df):
    """Render and save a choropleth of new cases for a single date column.

    Expects *df* with columns [date, 'new', 'fips'] (see the loop below):
    column 0 is the date, column 1 the metric to colour by.  Writes the
    figure to 'georgianew<date>.png' and displays it.  Relies on the
    module-level `counties` GeoJSON and plotly express `px`.
    """
    date = df.columns[0]
    metric = df.columns[1]
    # NOTE(review): the labels key '2020-04-25' is hard-coded and will not
    # match other date columns — presumably it should be `metric`; confirm.
    fig = px.choropleth(df, geojson=counties, locations='fips', color=metric,
                        color_continuous_scale="Viridis",
                        range_color=(0, 100),
                        scope="usa",
                        labels={'2020-04-25': 'new'}
                        )
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0}, title=date)
    filename = 'georgianew' + date + '.png'
    # Centre the date as the figure title (overrides the title set above).
    fig.update_layout(
        title={
            'text': date,
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'})
    fig.write_image(filename)
    fig.show()
# Preview the per-column frames that would feed make_plot: for each date
# column, pair it with fips and add a 'new' column of row-to-row diffs.
# NOTE(review): .diff() here runs down the county axis, not across dates —
# looks suspicious; confirm intent before re-enabling make_plot.
for col in df.columns[1:3]:
    # for col in df.columns[1:-1]:
    # BUG FIX: .copy() avoids pandas' SettingWithCopyWarning (chained
    # assignment on a slice of df) when adding the 'new' column.
    sm_df = df[[col, 'fips']].copy()
    sm_df['new'] = sm_df[col].diff()
    sm_df = sm_df[[col, 'new', 'fips']]
    print(sm_df)
    # make_plot(sm_df)
# Final, self-contained pass: re-import everything, rebuild the FIPS lookup,
# and (below) regenerate one choropleth per date column.
import pandas as pd
import plotly.express as px
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
fips = pd.read_csv('county_fips.csv')
fips_ga = fips[fips['State'] == 'GA']
fips_ga_dict = dict(zip(fips_ga.Name, fips_ga.FIPS))
fips_ga_dict
df = pd.read_csv('all_georgia.csv')


def get_fips(county):
    """Return the FIPS code for *county*, or 'na' when the county is unmapped."""
    # BUG FIX: catch only the missing-key case; the original bare `except`
    # also hid unrelated errors.
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'


df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
def make_plot(df):
    """Render and save a cumulative-infections choropleth for one date column.

    Expects *df* with columns [date, 'fips'] (see the loop below); colours
    Georgia counties by the value in the date column, titles the figure with
    the date, writes 'georgia_v5<date>.png', and displays the figure.
    Relies on the module-level `counties` GeoJSON and plotly express `px`.
    """
    date = df.columns[0]
    # NOTE(review): the labels key '2020-04-25' is hard-coded and only
    # matches that one column — presumably it should be `date`; confirm.
    fig = px.choropleth(df, geojson=counties, locations='fips', color=date,
                        color_continuous_scale="Viridis",
                        range_color=(0, 2500),
                        scope="usa",
                        labels={'2020-04-25': 'infected'}
                        )
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(margin={"r": 0, "t": 0, "l": 0, "b": 0})
    # Centred date title plus an explicit colourbar with fixed tick labels.
    fig.update_layout(
        title={
            'text': date,
            'y': 0.9,
            'x': 0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        coloraxis_colorbar=dict(
            title="Number of infections",
            titleside="top",
            tickmode="array",
            tickvals=[0, 500, 1000, 1500, 2000, 2500],
            ticktext=["0", "500", "1000", "1500", "2000", "2500"],
            ticks="outside"
        )
    )
    filename = 'georgia_v5' + date + '.png'
    fig.write_image(filename)
    fig.show()
# for col in df.columns[1:3]:
# Every column between 'name' and 'fips' is a date; plot each one.
date_columns = df.columns[1:-1]
for date_col in date_columns:
    make_plot(df[[date_col, 'fips']])