dailylog 4-25-20

1 minute read

Get Georgia COVID-19 Data by County

  1. Get data from ONE DAY
    1. PROBLEM: They host the data on a totally different webpage
    2. SOLUTION: Look that webpage up in the Wayback Machine
  2. Get links from the Wayback Machine calendar view
    1. PROBLEM: Same problem as with Worldometer – need to use the unofficial "API"
    2. SOLUTION: (PT 2 below)
  3. Hit one link per day

PT 1: Data from one day

import requests
import pandas as pd
from bs4 import BeautifulSoup as bs

url = "https://d20s4vd27d0hk0.cloudfront.net/"

page = requests.get(url)
soup = bs(page.content, "html.parser")

content_block = soup.find(id="summary").find_all('table')

# print(soup)
# print(content_block)

def tableDataText(table):
    """Flatten an HTML <table> into a list of rows, header row first if present."""
    rows = []
    trs = table.find_all('tr')
    header_row = [th.get_text(strip=True) for th in trs[0].find_all('th')]  # header row
    if header_row:  # if there is a header row, include it first
        rows.append(header_row)
        trs = trs[1:]
    for tr in trs:  # every remaining row is a data row
        rows.append([td.get_text(strip=True) for td in tr.find_all('td')])
    return rows

list_table = tableDataText(content_block[1])
dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
dftable.head(4)
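
As an aside, pandas can parse HTML tables directly, which would replace the hand-rolled parser above. A minimal sketch with pd.read_html (assumes lxml or html5lib is installed, and that the county table is still the second table on the page):

import pandas as pd
import requests

url = "https://d20s4vd27d0hk0.cloudfront.net/"
page = requests.get(url)

# read_html returns one DataFrame per <table> on the page;
# index 1 mirrors content_block[1] above -- verify against the live page.
tables = pd.read_html(page.text)
dftable = tables[1]
dftable.head(4)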

PT 2: Datetime stamps from the Wayback Machine

import urllib.request

get_data_from = "https://d20s4vd27d0hk0.cloudfront.net/"
url = "http://web.archive.org/cdx/search/cdx?url=" + get_data_from
with urllib.request.urlopen(url) as response:
    html = response.read().decode('utf-8')

# Each CDX line starts with the SURT-form urlkey, followed by a space and the
# 14-digit timestamp, so splitting on the urlkey leaves the timestamp up front.
strings = html.split("cloudfront,d20s4vd27d0hk0)/ ")
strings_dt = [string.split(' ')[0] for string in strings[1:]]
strings_dt
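
For reference, the CDX endpoint also accepts query parameters that do this filtering server-side. A sketch using the documented output, fl, and collapse options of the Wayback CDX server (collapse=timestamp:8 keeps one capture per calendar day, which would make the dedup in PT 3 unnecessary):

import requests

cdx_url = "http://web.archive.org/cdx/search/cdx"
params = {
    "url": "https://d20s4vd27d0hk0.cloudfront.net/",
    "output": "json",           # JSON rows instead of space-delimited text
    "fl": "timestamp",          # return only the timestamp field
    "collapse": "timestamp:8",  # first 8 digits = YYYYMMDD, so one row per day
}
rows = requests.get(cdx_url, params=params).json()
timestamps = [row[0] for row in rows[1:]]  # rows[0] is the header row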

PT 3: Get one snapshot per day

def get_one_per_day(ts):
    """Keep the first timestamp seen for each calendar day (first 8 chars = YYYYMMDD)."""
    ts_obj = {}
    for d in ts:
        if d[:8] not in ts_obj:
            ts_obj[d[:8]] = d
    return list(ts_obj.values())
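
As a quick sanity check with hypothetical timestamps, two captures on the same day collapse to the first one:

get_one_per_day(["20200424010203", "20200424120000", "20200425090000"])
# -> ['20200424010203', '20200425090000']
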
def download_df(one_per_day):
    """Download the county table from each day's snapshot and save it as a CSV."""
    for timestamp in one_per_day:
        url = "https://web.archive.org/web/" + timestamp + "/https://d20s4vd27d0hk0.cloudfront.net/"
        page = requests.get(url)
        soup = bs(page.content, "html.parser")
        content_block = soup.find(id="summary").find_all('table')

        # Reuse tableDataText() from PT 1 to flatten the second table.
        list_table = tableDataText(content_block[1])
        dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
        dftable['timestamp'] = timestamp
        filename = timestamp + "_COVID19Georgia.csv"
        print(dftable.head())
        dftable.to_csv(filename, index=False)

one_per_day = get_one_per_day(strings_dt)
download_df(one_per_day)
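
Once the per-day CSVs exist, stitching them into one long table is a pd.concat away. A minimal sketch, assuming the files sit in the working directory and follow the naming pattern above:

import glob
import pandas as pd

# Collect every per-day CSV written by download_df(), oldest first.
files = sorted(glob.glob("*_COVID19Georgia.csv"))
all_days = pd.concat((pd.read_csv(f) for f in files), ignore_index=True)
print(all_days.shape)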