dailylog 4-25-20
1 minute read
Get Georgia COVID19 Data by County
- Get data from ONE DAY
- PROBLEM: They host the data on a totally different webpage
- SOLUTION: Look up that other webpage's URL in the Wayback Machine
- Get links from the Wayback Machine calendar view
- PROBLEM: Same problem as with worldometer – need to use fake “API”
- SOLUTION: (PT2 below)
- Hit one link per day
PT 1: Data from one day
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
# Page that actually hosts the data (see the PROBLEM/SOLUTION notes above).
url = "https://d20s4vd27d0hk0.cloudfront.net/"

# Download and parse the page once; the duplicated request/parse pair was
# redundant and only doubled the network traffic.
page = requests.get(url)
soup = bs(page.content, "html.parser")

# Every <table> nested inside the element whose id is "summary".
content_block = soup.find(id="summary").find_all('table')

# Uncomment to inspect the raw parse results while debugging:
# print(soup)
# print(content_block)
def tableDataText(table):
    """Convert a parsed HTML table into a list of rows of cell text.

    Args:
        table: A parsed ``<table>`` element that supports ``find_all``
            (for example a BeautifulSoup tag).

    Returns:
        A list of lists of whitespace-stripped cell strings. When the first
        ``<tr>`` contains ``<th>`` cells, they become the first returned row
        and the remaining ``<tr>`` elements contribute their ``<td>`` cells.
        Otherwise every ``<tr>`` contributes its ``<td>`` cells. A table
        without any ``<tr>`` yields an empty list.
    """
    def cell_text(tr, tag):
        # Stripped text of every <tag> cell found in one table row.
        return [cell.get_text(strip=True) for cell in tr.find_all(tag)]

    trs = table.find_all('tr')
    if not trs:
        # Without this guard an empty table raised IndexError on trs[0].
        return []

    rows = []
    headerow = cell_text(trs[0], 'th')  # header row
    if headerow:  # if there is a header row include it first
        rows.append(headerow)
        trs = trs[1:]
    rows.extend(cell_text(tr, 'td') for tr in trs)  # data rows
    return rows
# Turn the second table of the summary block into rows of text, then into a
# DataFrame whose column names come from the first extracted row.
list_table = tableDataText(content_block[1])
header_row, data_rows = list_table[0], list_table[1:]
dftable = pd.DataFrame(data_rows, columns=header_row)
# Notebook-style preview of the first four rows.
dftable.head(4)
PT 2: Datetime stamps from the Wayback Machine
# urllib.request was used below without ever being imported (NameError).
import urllib.request

# Ask the Wayback Machine CDX endpoint for every capture of the data page.
get_data_from = "https://d20s4vd27d0hk0.cloudfront.net/"
url = "http://web.archive.org/cdx/search/cdx?url=" + get_data_from
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')

# Splitting on the tail of the URL key leaves each later piece starting with
# the capture's timestamp, so the first space-separated token is the
# timestamp (presumably 14-digit YYYYMMDDhhmmss -- confirm against the CDX
# output). The piece before the first key is discarded via strings[1:].
strings = html.split("cloudfront,d20s4vd27d0hk0)/ ")
strings_dt = [string.split(' ')[0] for string in strings[1:]]
# Notebook-style display of the collected timestamps.
strings_dt
PT 3: Get one snapshot per day
def get_one_per_day(ts):
    """Return the first timestamp encountered for each distinct day.

    Args:
        ts: Iterable of timestamp strings whose first eight characters
            identify the day.

    Returns:
        A list holding, for every distinct eight-character prefix, the first
        timestamp in ``ts`` that carried it, in order of first appearance.
    """
    first_by_day = {}
    for stamp in ts:
        # setdefault only stores when the day has not been seen yet.
        first_by_day.setdefault(stamp[:8], stamp)
    return list(first_by_day.values())
def download_df(one_per_day):
    """Download each archived snapshot and save its summary table as a CSV.

    For every timestamp, the Wayback Machine copy of the data page is
    fetched, the second table inside the element with id "summary" is
    converted to a DataFrame, a ``timestamp`` column is added, and the result
    is written to ``<timestamp>_COVID19Georgia.csv`` in the working
    directory. The first rows of each frame are printed as progress output.

    Snapshots in which that table cannot be found (or is empty) are reported
    and skipped, so one unusable capture no longer aborts the whole batch.

    Args:
        one_per_day: Iterable of Wayback Machine timestamp strings.

    Returns:
        None.
    """
    for timestamp in one_per_day:
        url = "https://web.archive.org/web/" + timestamp + "/https://d20s4vd27d0hk0.cloudfront.net/"
        page = requests.get(url)
        soup = bs(page.content, "html.parser")

        summary = soup.find(id="summary")
        tables = summary.find_all('table') if summary is not None else []
        if len(tables) < 2:
            # Previously this case crashed with AttributeError/IndexError.
            print("Skipping " + timestamp + ": summary table not found")
            continue

        # Reuse the module-level parser instead of redefining an identical
        # copy of it on every loop iteration.
        list_table = tableDataText(tables[1])
        if not list_table:
            print("Skipping " + timestamp + ": summary table is empty")
            continue

        dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
        dftable['timestamp'] = timestamp
        filename = timestamp + "_COVID19Georgia.csv"
        print(dftable.head())
        dftable.to_csv(filename, index=False)
# `one_per_day` was never defined before being passed to download_df
# (NameError); derive it from the Wayback timestamps collected in PT2.
one_per_day = get_one_per_day(strings_dt)
download_df(one_per_day)