1. Get data from the sheet
2. Get links from the Wayback Machine calendar view
3. Hit one link per day
In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
In [66]:
# Quick test fetch of the cached Georgia DPH report page
url = "https://d20s4vd27d0hk0.cloudfront.net/"
page = requests.get(url)
soup = bs(page.content, "html.parser")
In [71]:
# url = "https://dph.georgia.gov/covid-19-daily-status-report"
def download_df(url):
    page = requests.get(url)
    soup= bs(page.content, "html.parser")
    content_block = soup.find(id="summary").find_all('table')

    def tableDataText(table):       
        rows = []
        trs = table.find_all('tr')
        headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
        if headerow: # if there is a header row include first
            rows.append(headerow)
            trs = trs[1:]
        for tr in trs: # for every table row
            rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
        return rows

    list_table = tableDataText(content_block[1])
    dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
    dftable.head(4)
#     dftable['timestamp'] = timestamp
#     filename = "drive/My Drive/data/worldometer/" + timestamp + "_worlometer.csv"
    print(dftable.head())
    dftable.to_csv('georgia_covid19.csv', index=False)

url = "https://d20s4vd27d0hk0.cloudfront.net/"
download_df(url)
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths
0                              Fulton      2543         91
1                              Dekalb      1788         36
2                           Dougherty      1470        108
3                            Gwinnett      1460         46
4                                Cobb      1395         74
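The fetch above assumes the request succeeds and the snapshot still has a `summary` block. A minimal, more defensive sketch (a hypothetical helper, not part of the original run; the 15-second timeout is an arbitrary choice):

In [ ]:
import requests
from bs4 import BeautifulSoup as bs

def fetch_summary_tables(url, timeout=15):
    """Fetch a report page and return its summary tables, or None if the block is missing."""
    page = requests.get(url, timeout=timeout)  # fail fast instead of hanging
    page.raise_for_status()                    # surface HTTP 4xx/5xx errors
    soup = bs(page.content, "html.parser")
    summary = soup.find(id="summary")
    return summary.find_all('table') if summary is not None else None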

GET DATA FROM WAYBACK MACHINE

  • Get all URLs
In [63]:
import urllib.request

def get_timestamps(site):
    # Query the Wayback Machine CDX index for every capture of the site
    url = "http://web.archive.org/cdx/search/cdx?url=" + site
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')

    # Must customize this part per website: split on the CDX urlkey prefix,
    # then take the timestamp field from each capture line
    strings = html.split("cloudfront,d20s4vd27d0hk0)/ ")
    strings_dt = [string.split(' ')[0] for string in strings[1:]]
    return strings_dt
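As an aside, the CDX server can do most of this parsing itself through query parameters (`output=json`, `fl=timestamp`, and `collapse=timestamp:8` for one capture per day). A sketch of that alternative, not the approach used in the rest of this notebook:

In [ ]:
import json
import urllib.request

def get_daily_timestamps(site):
    # collapse=timestamp:8 keeps one capture per YYYYMMDD; fl=timestamp returns only timestamps
    url = ("http://web.archive.org/cdx/search/cdx?url=" + site +
           "&output=json&fl=timestamp&collapse=timestamp:8")
    with urllib.request.urlopen(url) as response:
        rows = json.loads(response.read().decode('utf-8'))
    return [row[0] for row in rows[1:]]  # first row is the field-name header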
In [64]:
def get_one_per_day(ts):
    # Keep only the first capture for each calendar day (timestamp prefix YYYYMMDD)
    ts_obj = {}
    for d in ts:
        if d[:8] not in ts_obj:
            ts_obj[d[:8]] = d
    return list(ts_obj.values())

ts = get_timestamps("https://d20s4vd27d0hk0.cloudfront.net/")
one_per_day = get_one_per_day(ts)
one_per_day
Out[64]:
['20200331042732',
 '20200401155500',
 '20200402130332',
 '20200403001209',
 '20200404003353',
 '20200405161037',
 '20200406025628',
 '20200407161042',
 '20200408001918',
 '20200409025326',
 '20200410161045',
 '20200411161053',
 '20200412013948',
 '20200413161051',
 '20200414004432',
 '20200415022915',
 '20200416021759',
 '20200417025925',
 '20200418014416',
 '20200419161040',
 '20200420135015',
 '20200421025924',
 '20200422161044',
 '20200423051926',
 '20200425203855']
In [76]:
# url = "https://dph.georgia.gov/covid-19-daily-status-report"
def download_df(one_per_day):
    
    for timestamp in one_per_day:
        
        url = url = "https://web.archive.org/web/"+ timestamp + "/https://d20s4vd27d0hk0.cloudfront.net/"
        page = requests.get(url)
        soup= bs(page.content, "html.parser")
        content_block = soup.find(id="summary").find_all('table')

        def tableDataText(table):       
            rows = []
            trs = table.find_all('tr')
            headerow = [td.get_text(strip=True) for td in trs[0].find_all('th')] # header row
            if headerow: # if there is a header row include first
                rows.append(headerow)
                trs = trs[1:]
            for tr in trs: # for every table row
                rows.append([td.get_text(strip=True) for td in tr.find_all('td')]) # data row
            return rows

        list_table = tableDataText(content_block[1])
        dftable = pd.DataFrame(list_table[1:], columns=list_table[0])
        dftable.head(4)
        dftable['timestamp'] = timestamp
        filename = timestamp + "_COVID19Georgia.csv"
        print(dftable.head())
        dftable.to_csv(filename, index=False)

# url = "https://d20s4vd27d0hk0.cloudfront.net/"
download_df(one_per_day)
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       503         16  20200331042732
1                              Dekalb       294          3  20200331042732
2                           Dougherty       278         18  20200331042732
3                                Cobb       250         11  20200331042732
4                            Gwinnett       178          2  20200331042732
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       624         19  20200401155500
1                           Dougherty       480         27  20200401155500
2                              Dekalb       365          4  20200401155500
3                                Cobb       293         15  20200401155500
4                            Gwinnett       250          4  20200401155500
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       638         20  20200402130332
1                           Dougherty       490         29  20200402130332
2                              Dekalb       373          5  20200402130332
3                                Cobb       304         15  20200402130332
4                            Gwinnett       257          6  20200402130332
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       747         23  20200403001209
1                           Dougherty       521         30  20200403001209
2                              Dekalb       409          8  20200403001209
3                                Cobb       341         17  20200403001209
4                            Gwinnett       303          8  20200403001209
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       910         26  20200404003353
1                           Dougherty       607         30  20200404003353
2                              Dekalb       483          8  20200404003353
3                                Cobb       422         20  20200404003353
4                            Gwinnett       353          8  20200404003353
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       962         27  20200405161037
1                           Dougherty       686         30  20200405161037
2                              Dekalb       543          9  20200405161037
3                                Cobb       456         23  20200405161037
4                            Gwinnett       408          7  20200405161037
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton       970         28  20200406025628
1                           Dougherty       688         31  20200406025628
2                              Dekalb       549          9  20200406025628
3                                Cobb       474         24  20200406025628
4                            Gwinnett       410          7  20200406025628
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1124         36  20200407161042
1                           Dougherty       939         52  20200407161042
2                              Dekalb       645         11  20200407161042
3                                Cobb       550         29  20200407161042
4                            Gwinnett       525         10  20200407161042
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1185         39  20200408001918
1                           Dougherty       973         56  20200408001918
2                              Dekalb       673         11  20200408001918
3                                Cobb       566         29  20200408001918
4                            Gwinnett       540         13  20200408001918
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1261         42  20200409025326
1                           Dougherty      1001         62  20200409025326
2                              Dekalb       732         12  20200409025326
3                                Cobb       608         29  20200409025326
4                            Gwinnett       586         17  20200409025326
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1364         48  20200410161045
1                           Dougherty      1062         67  20200410161045
2                              Dekalb       795         13  20200410161045
3                                Cobb       662         33  20200410161045
4                            Gwinnett       646         17  20200410161045
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1422         50  20200411161053
1                           Dougherty      1076         69  20200411161053
2                              Dekalb       835         13  20200411161053
3                                Cobb       696         33  20200411161053
4                            Gwinnett       675         18  20200411161053
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1446         50  20200412013948
1                           Dougherty      1102         72  20200412013948
2                              Dekalb       848         14  20200412013948
3                                Cobb       705         32  20200412013948
4                            Gwinnett       681         18  20200412013948
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1598         52  20200413161051
1                           Dougherty      1217         77  20200413161051
2                              Dekalb       980         14  20200413161051
3                                Cobb       782         35  20200413161051
4                            Gwinnett       739         22  20200413161051
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1635         52  20200414004432
1                           Dougherty      1245         78  20200414004432
2                              Dekalb      1006         15  20200414004432
3                                Cobb       816         36  20200414004432
4                            Gwinnett       766         24  20200414004432
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1812         60  20200415022915
1                           Dougherty      1297         78  20200415022915
2                              Dekalb      1144         15  20200415022915
3                                Cobb       895         41  20200415022915
4                            Gwinnett       815         29  20200415022915
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1902         63  20200416021759
1                           Dougherty      1320         84  20200416021759
2                              Dekalb      1225         16  20200416021759
3                                Cobb       950         44  20200416021759
4                            Gwinnett       885         32  20200416021759
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      1945         66  20200417025925
1                           Dougherty      1358         88  20200417025925
2                              Dekalb      1260         21  20200417025925
3                                Cobb      1014         49  20200417025925
4                            Gwinnett       917         32  20200417025925
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2037         74  20200418014416
1                           Dougherty      1385         91  20200418014416
2                              Dekalb      1366         23  20200418014416
3                                Cobb      1085         51  20200418014416
4                            Gwinnett      1037         36  20200418014416
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2110         75  20200419161040
1                              Dekalb      1452         24  20200419161040
2                           Dougherty      1422         94  20200419161040
3                                Cobb      1125         51  20200419161040
4                            Gwinnett      1099         38  20200419161040
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2131         75  20200420135015
1                              Dekalb      1473         24  20200420135015
2                           Dougherty      1425         90  20200420135015
3                                Cobb      1148         54  20200420135015
4                            Gwinnett      1124         38  20200420135015
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2198         80  20200421025924
1                              Dekalb      1520         26  20200421025924
2                           Dougherty      1436         98  20200421025924
3                                Cobb      1196         56  20200421025924
4                            Gwinnett      1181         42  20200421025924
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2222         84  20200422161044
1                              Dekalb      1587         30  20200422161044
2                           Dougherty      1468        107  20200422161044
3                            Gwinnett      1263         46  20200422161044
4                                Cobb      1259         61  20200422161044
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2255         85  20200423051926
1                              Dekalb      1609         31  20200423051926
2                           Dougherty      1479        106  20200423051926
3                            Gwinnett      1273         46  20200423051926
4                                Cobb      1272         63  20200423051926
  COVID-19 Confirmed Cases By County: No. Cases No. Deaths       timestamp
0                              Fulton      2509         91  20200425203855
1                              Dekalb      1734         36  20200425203855
2                           Dougherty      1467        108  20200425203855
3                            Gwinnett      1421         46  20200425203855
4                                Cobb      1377         73  20200425203855
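The loop above writes each `<timestamp>_COVID19Georgia.csv` into the working directory, while the next cell reads from a `georgia/` folder, so the files were presumably moved there by hand. A small, hypothetical tweak would save them into that folder directly and pause between Wayback requests:

In [ ]:
import os
import time

def save_county_table(dftable, timestamp, folder='georgia'):
    # Sketch (not the original run): write the per-snapshot CSV straight into the folder
    os.makedirs(folder, exist_ok=True)
    filename = os.path.join(folder, timestamp + "_COVID19Georgia.csv")
    dftable.to_csv(filename, index=False)
    time.sleep(1)  # be polite to the Wayback Machine between requests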
In [92]:
import glob, os
# Combine the per-snapshot CSVs (collected into the georgia/ folder) into one DataFrame
df = pd.concat(map(pd.read_csv, glob.glob(os.path.join('georgia', "*.csv"))))
In [99]:
df['timestamp'] = df['timestamp'].astype(str)
# Derive a YYYY-MM-DD date from the 14-digit Wayback timestamp
df['date'] = df.apply(lambda x: x['timestamp'][:4] + '-' + x['timestamp'][4:6] + '-' + x['timestamp'][6:8], axis=1)
df.head()
Out[99]:
  COVID-19 Confirmed Cases By County:  No. Cases  No. Deaths       timestamp        date
0                              Fulton     2198.0        80.0  20200421025924  2020-04-21
1                              Dekalb     1520.0        26.0  20200421025924  2020-04-21
2                           Dougherty     1436.0        98.0  20200421025924  2020-04-21
3                                Cobb     1196.0        56.0  20200421025924  2020-04-21
4                            Gwinnett     1181.0        42.0  20200421025924  2020-04-21
In [100]:
df.columns = ['county', 'cases', 'deaths', 'ts', 'date']
In [101]:
## PREPPING DATA FOR OG D3 RACES

train_group = pd.DataFrame(df.groupby(['county','date'])['cases'].sum())
train_group.reset_index(inplace=True)
train_group.columns = ['name', 'date', 'value']

## GETTING ONLY MARCH DATA
# train_group = train_group[train_group['date'] >= '2020-03-01']

# Pivot each county's daily case counts into one wide row per county
reshaped_df = pd.DataFrame()
for county in list(set(train_group['name'])):
    county_df = train_group[train_group['name'] == county]
    new_df = county_df.pivot_table('value', 'name', 'date')
    reshaped_df = reshaped_df.append(new_df)

reshaped_df.reset_index(inplace=True)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py:6692: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort)
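The loop-and-append above can be collapsed into a single pivot over the whole frame. A sketch, assuming the same `train_group` columns (name, date, value):

In [ ]:
# Equivalent reshape in one call: each (name, date) pair is unique after the groupby,
# so pivot_table's default aggregation does not change any values.
reshaped_alt = train_group.pivot_table(values='value', index='name', columns='date')
reshaped_alt.reset_index(inplace=True)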
In [102]:
reshaped_df
Out[102]:
date name 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 ... 2020-04-15 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23 2020-04-25
0 Henry 68.0 90.0 95.0 115.0 143.0 167.0 168.0 194.0 208.0 ... 306.0 328.0 330.0 350.0 367.0 371.0 382.0 388.0 386.0 424.0
1 Barrow 9.0 20.0 20.0 20.0 22.0 25.0 25.0 40.0 42.0 ... 65.0 68.0 81.0 85.0 90.0 90.0 93.0 101.0 103.0 111.0
2 Paulding 26.0 32.0 33.0 35.0 46.0 50.0 51.0 60.0 62.0 ... 115.0 118.0 123.0 132.0 133.0 137.0 141.0 144.0 145.0 156.0
3 Twiggs 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 ... 3.0 3.0 4.0 4.0 4.0 4.0 4.0 4.0 6.0 8.0
4 Troup 15.0 19.0 19.0 21.0 30.0 34.0 34.0 46.0 47.0 ... 68.0 72.0 77.0 80.0 87.0 92.0 100.0 107.0 109.0 122.0
5 Grady NaN 1.0 2.0 2.0 3.0 3.0 3.0 5.0 6.0 ... 21.0 23.0 26.0 27.0 31.0 31.0 34.0 38.0 40.0 48.0
6 Bryan 7.0 8.0 9.0 11.0 15.0 16.0 17.0 26.0 26.0 ... 30.0 30.0 30.0 31.0 33.0 33.0 34.0 38.0 37.0 47.0
7 Crawford NaN NaN NaN NaN NaN 1.0 1.0 2.0 1.0 ... 9.0 10.0 12.0 13.0 13.0 13.0 14.0 14.0 14.0 14.0
8 Dawson 7.0 11.0 11.0 11.0 13.0 16.0 16.0 20.0 20.0 ... 36.0 37.0 39.0 42.0 44.0 44.0 45.0 48.0 49.0 51.0
9 *Based on patient county of residence when known 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
10 Wayne NaN NaN NaN NaN NaN 1.0 1.0 3.0 3.0 ... 6.0 6.0 7.0 7.0 7.0 7.0 8.0 10.0 10.0 11.0
11 Crisp 8.0 17.0 19.0 21.0 24.0 28.0 28.0 46.0 46.0 ... 83.0 93.0 104.0 113.0 118.0 118.0 125.0 128.0 130.0 151.0
12 Lumpkin 4.0 7.0 7.0 7.0 7.0 9.0 9.0 9.0 9.0 ... 11.0 11.0 19.0 22.0 23.0 23.0 24.0 26.0 26.0 29.0
13 Lee 44.0 94.0 94.0 104.0 123.0 136.0 138.0 195.0 204.0 ... 258.0 264.0 267.0 269.0 272.0 274.0 276.0 292.0 294.0 297.0
14 Wilkes 2.0 2.0 2.0 2.0 3.0 3.0 3.0 5.0 5.0 ... 8.0 9.0 11.0 11.0 17.0 18.0 18.0 21.0 21.0 22.0
15 Murray 4.0 5.0 5.0 7.0 7.0 9.0 9.0 11.0 11.0 ... 16.0 17.0 17.0 18.0 18.0 18.0 18.0 19.0 19.0 22.0
16 Butts 6.0 8.0 8.0 9.0 9.0 11.0 12.0 17.0 18.0 ... 35.0 36.0 38.0 40.0 43.0 50.0 80.0 83.0 125.0 125.0
17 Burke 4.0 8.0 8.0 8.0 8.0 9.0 9.0 14.0 15.0 ... 28.0 28.0 35.0 38.0 47.0 47.0 48.0 51.0 53.0 71.0
18 Brantley NaN NaN NaN NaN NaN NaN NaN 3.0 5.0 ... 12.0 14.0 16.0 17.0 18.0 19.0 20.0 20.0 20.0 20.0
19 Pierce 2.0 5.0 7.0 10.0 16.0 16.0 16.0 28.0 32.0 ... 47.0 48.0 49.0 49.0 51.0 51.0 51.0 51.0 51.0 52.0
20 Lanier NaN NaN 1.0 1.0 1.0 1.0 1.0 5.0 5.0 ... 5.0 5.0 6.0 6.0 6.0 6.0 7.0 7.0 7.0 8.0
21 Newton 22.0 32.0 34.0 37.0 42.0 55.0 55.0 65.0 67.0 ... 100.0 113.0 115.0 121.0 128.0 130.0 139.0 144.0 146.0 159.0
22 Clinch 1.0 1.0 1.0 2.0 2.0 2.0 2.0 3.0 3.0 ... 6.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0 7.0
23 Dooly 3.0 10.0 10.0 11.0 12.0 15.0 15.0 28.0 27.0 ... 51.0 55.0 64.0 69.0 70.0 70.0 71.0 73.0 74.0 108.0
24 Seminole 7.0 6.0 5.0 6.0 8.0 8.0 9.0 12.0 13.0 ... 16.0 19.0 21.0 20.0 21.0 22.0 25.0 26.0 27.0 27.0
25 Madison 3.0 3.0 3.0 5.0 7.0 7.0 7.0 9.0 9.0 ... 12.0 13.0 14.0 14.0 14.0 14.0 15.0 18.0 18.0 21.0
26 Montgomery NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0
27 Berrien 1.0 2.0 2.0 3.0 3.0 3.0 3.0 4.0 4.0 ... 8.0 8.0 10.0 10.0 10.0 10.0 10.0 10.0 10.0 14.0
28 Catoosa 3.0 4.0 4.0 5.0 6.0 6.0 6.0 7.0 7.0 ... 15.0 17.0 18.0 19.0 19.0 20.0 23.0 29.0 45.0 46.0
29 Stephens 3.0 5.0 5.0 5.0 5.0 6.0 6.0 11.0 13.0 ... 24.0 24.0 28.0 30.0 35.0 35.0 40.0 41.0 42.0 56.0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
130 Ware 5.0 10.0 12.0 14.0 17.0 18.0 18.0 37.0 37.0 ... 64.0 67.0 76.0 77.0 85.0 86.0 88.0 95.0 95.0 103.0
131 Elbert NaN NaN NaN NaN 1.0 1.0 1.0 2.0 2.0 ... 6.0 7.0 7.0 7.0 7.0 7.0 7.0 10.0 10.0 16.0
132 Mcduffie 2.0 3.0 3.0 4.0 5.0 6.0 6.0 13.0 16.0 ... 30.0 31.0 35.0 35.0 37.0 37.0 37.0 37.0 37.0 42.0
133 Clarke 47.0 51.0 54.0 55.0 59.0 62.0 62.0 77.0 78.0 ... 95.0 102.0 102.0 106.0 108.0 110.0 114.0 116.0 118.0 127.0
134 Talbot 1.0 1.0 1.0 1.0 3.0 4.0 4.0 6.0 8.0 ... 10.0 13.0 15.0 16.0 16.0 16.0 19.0 22.0 21.0 21.0
135 Douglas 43.0 55.0 56.0 66.0 78.0 86.0 91.0 111.0 117.0 ... 189.0 201.0 209.0 222.0 230.0 233.0 243.0 253.0 259.0 276.0
136 Meriwether 6.0 7.0 7.0 9.0 10.0 11.0 11.0 17.0 20.0 ... 24.0 28.0 35.0 40.0 42.0 42.0 46.0 47.0 48.0 49.0
137 Gordon 15.0 18.0 18.0 20.0 21.0 23.0 23.0 25.0 25.0 ... 45.0 48.0 50.0 55.0 64.0 64.0 64.0 68.0 70.0 78.0
138 Hancock NaN NaN NaN NaN NaN NaN NaN 1.0 1.0 ... 5.0 6.0 6.0 6.0 7.0 7.0 10.0 10.0 11.0 16.0
139 Habersham NaN 2.0 2.0 2.0 2.0 2.0 2.0 6.0 7.0 ... 28.0 44.0 57.0 72.0 74.0 75.0 99.0 119.0 130.0 165.0
140 Fulton 503.0 624.0 638.0 747.0 910.0 962.0 970.0 1124.0 1185.0 ... 1812.0 1902.0 1945.0 2037.0 2110.0 2131.0 2198.0 2222.0 2255.0 2509.0
141 Turner 3.0 4.0 5.0 6.0 12.0 12.0 12.0 16.0 19.0 ... 41.0 46.0 49.0 51.0 56.0 58.0 59.0 59.0 59.0 62.0
142 Telfair NaN NaN NaN 2.0 2.0 4.0 4.0 4.0 4.0 ... 11.0 11.0 13.0 14.0 14.0 14.0 15.0 18.0 17.0 25.0
143 Screven NaN 2.0 2.0 2.0 2.0 2.0 2.0 3.0 4.0 ... 9.0 9.0 11.0 11.0 11.0 11.0 11.0 14.0 14.0 14.0
144 Rabun NaN 2.0 2.0 2.0 2.0 3.0 3.0 5.0 5.0 ... 7.0 7.0 7.0 7.0 7.0 7.0 8.0 8.0 9.0 9.0
145 Webster NaN 1.0 1.0 1.0 1.0 2.0 2.0 4.0 4.0 ... 3.0 3.0 4.0 5.0 5.0 5.0 6.0 7.0 7.0 7.0
146 Upson 5.0 5.0 6.0 6.0 8.0 13.0 13.0 28.0 29.0 ... 128.0 139.0 143.0 164.0 170.0 173.0 175.0 186.0 185.0 194.0
147 Oglethorpe NaN 1.0 1.0 1.0 1.0 1.0 1.0 2.0 2.0 ... 19.0 31.0 32.0 32.0 32.0 33.0 34.0 37.0 34.0 48.0
148 Effingham 4.0 5.0 5.0 6.0 6.0 9.0 12.0 17.0 18.0 ... 25.0 24.0 24.0 25.0 26.0 26.0 27.0 30.0 30.0 34.0
149 Jenkins 1.0 1.0 1.0 1.0 2.0 2.0 2.0 7.0 7.0 ... 11.0 12.0 12.0 12.0 12.0 12.0 13.0 14.0 14.0 16.0
150 Banks 1.0 3.0 3.0 3.0 3.0 3.0 3.0 3.0 4.0 ... 8.0 9.0 13.0 13.0 14.0 14.0 15.0 16.0 16.0 18.0
151 Atkinson NaN NaN NaN NaN NaN NaN NaN 4.0 4.0 ... 2.0 2.0 4.0 5.0 5.0 5.0 5.0 5.0 5.0 7.0
152 Jasper 2.0 2.0 2.0 2.0 2.0 4.0 3.0 6.0 6.0 ... 9.0 10.0 10.0 14.0 17.0 17.0 17.0 18.0 18.0 20.0
153 Marion NaN NaN NaN NaN 1.0 2.0 2.0 3.0 4.0 ... 17.0 17.0 20.0 26.0 28.0 28.0 29.0 33.0 33.0 35.0
154 Cherokee 69.0 83.0 85.0 94.0 114.0 122.0 124.0 144.0 147.0 ... 219.0 232.0 245.0 274.0 293.0 300.0 317.0 331.0 340.0 371.0
155 Harris 4.0 3.0 3.0 5.0 6.0 7.0 7.0 12.0 12.0 ... 23.0 27.0 34.0 36.0 39.0 42.0 45.0 48.0 49.0 55.0
156 Non-Georgia Resident NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 665.0 677.0 724.0 774.0 868.0 870.0 951.0 1002.0 1003.0 957.0
157 Franklin 4.0 5.0 5.0 5.0 5.0 5.0 5.0 3.0 3.0 ... 5.0 7.0 10.0 10.0 10.0 10.0 12.0 13.0 13.0 16.0
158 Jones 2.0 4.0 4.0 5.0 6.0 6.0 6.0 7.0 7.0 ... 14.0 15.0 15.0 16.0 17.0 17.0 20.0 22.0 22.0 24.0
159 Tift 19.0 24.0 28.0 29.0 34.0 34.0 34.0 43.0 45.0 ... 71.0 76.0 79.0 80.0 86.0 91.0 92.0 91.0 93.0 99.0

160 rows × 26 columns

In [103]:
reshaped_df.to_csv('all_georgia.csv', index=False)
In [3]:
df = pd.read_csv('all_georgia.csv')
In [38]:
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)
In [39]:
# counties['features'] is a list of GeoJSON features, so index it by position
counties['features'][0]
In [ ]:
set(list(df['name']))

In [40]:
fips = pd.read_csv('county_fips.csv')
In [41]:
fips_ga = fips[fips['State'] == 'GA']
In [42]:
fips_ga_dict = dict(zip(fips_ga.Name,fips_ga.FIPS)) 
fips_ga_dict
Out[42]:
{'Appling': 13001,
 'Atkinson': 13003,
 'Bacon': 13005,
 'Baker': 13007,
 'Baldwin': 13009,
 'Banks': 13011,
 'Barrow': 13013,
 'Bartow': 13015,
 'Ben Hill': 13017,
 'Berrien': 13019,
 'Bibb': 13021,
 'Bleckley': 13023,
 'Brantley': 13025,
 'Brooks': 13027,
 'Bryan': 13029,
 'Bulloch': 13031,
 'Burke': 13033,
 'Butts': 13035,
 'Calhoun': 13037,
 'Camden': 13039,
 'Candler': 13043,
 'Carroll': 13045,
 'Catoosa': 13047,
 'Charlton': 13049,
 'Chatham': 13051,
 'Chattahoochee': 13053,
 'Chattooga': 13055,
 'Cherokee': 13057,
 'Clarke': 13059,
 'Clay': 13061,
 'Clayton': 13063,
 'Clinch': 13065,
 'Cobb': 13067,
 'Coffee': 13069,
 'Colquitt': 13071,
 'Columbia': 13073,
 'Cook': 13075,
 'Coweta': 13077,
 'Crawford': 13079,
 'Crisp': 13081,
 'Dade': 13083,
 'Dawson': 13085,
 'Decatur': 13087,
 'De Kalb': 13089,
 'Dodge': 13091,
 'Dooly': 13093,
 'Dougherty': 13095,
 'Douglas': 13097,
 'Early': 13099,
 'Echols': 13101,
 'Effingham': 13103,
 'Elbert': 13105,
 'Emanuel': 13107,
 'Evans': 13109,
 'Fannin': 13111,
 'Fayette': 13113,
 'Floyd': 13115,
 'Forsyth': 13117,
 'Franklin': 13119,
 'Fulton': 13121,
 'Gilmer': 13123,
 'Glascock': 13125,
 'Glynn': 13127,
 'Gordon': 13129,
 'Grady': 13131,
 'Greene': 13133,
 'Gwinnett': 13135,
 'Habersham': 13137,
 'Hall': 13139,
 'Hancock': 13141,
 'Haralson': 13143,
 'Harris': 13145,
 'Hart': 13147,
 'Heard': 13149,
 'Henry': 13151,
 'Houston': 13153,
 'Irwin': 13155,
 'Jackson': 13157,
 'Jasper': 13159,
 'Jeff Davis': 13161,
 'Jefferson': 13163,
 'Jenkins': 13165,
 'Johnson': 13167,
 'Jones': 13169,
 'Lamar': 13171,
 'Lanier': 13173,
 'Laurens': 13175,
 'Lee': 13177,
 'Liberty': 13179,
 'Lincoln': 13181,
 'Long': 13183,
 'Lowndes': 13185,
 'Lumpkin': 13187,
 'McDuffie': 13189,
 'McIntosh': 13191,
 'Macon': 13193,
 'Madison': 13195,
 'Marion': 13197,
 'Meriwether': 13199,
 'Miller': 13201,
 'Mitchell': 13205,
 'Monroe': 13207,
 'Montgomery': 13209,
 'Morgan': 13211,
 'Murray': 13213,
 'Muscogee': 13215,
 'Newton': 13217,
 'Oconee': 13219,
 'Oglethorpe': 13221,
 'Paulding': 13223,
 'Peach': 13225,
 'Pickens': 13227,
 'Pierce': 13229,
 'Pike': 13231,
 'Polk': 13233,
 'Pulaski': 13235,
 'Putnam': 13237,
 'Quitman': 13239,
 'Rabun': 13241,
 'Randolph': 13243,
 'Richmond': 13245,
 'Rockdale': 13247,
 'Schley': 13249,
 'Screven': 13251,
 'Seminole': 13253,
 'Spalding': 13255,
 'Stephens': 13257,
 'Stewart': 13259,
 'Sumter': 13261,
 'Talbot': 13263,
 'Taliaferro': 13265,
 'Tattnall': 13267,
 'Taylor': 13269,
 'Telfair': 13271,
 'Terrell': 13273,
 'Thomas': 13275,
 'Tift': 13277,
 'Toombs': 13279,
 'Towns': 13281,
 'Treutlen': 13283,
 'Troup': 13285,
 'Turner': 13287,
 'Twiggs': 13289,
 'Union': 13291,
 'Upson': 13293,
 'Walker': 13295,
 'Walton': 13297,
 'Ware': 13299,
 'Warren': 13301,
 'Washington': 13303,
 'Wayne': 13305,
 'Webster': 13307,
 'Wheeler': 13309,
 'White': 13311,
 'Whitfield': 13313,
 'Wilcox': 13315,
 'Wilkes': 13317,
 'Wilkinson': 13319,
 'Worth': 13321}
In [43]:
def get_fips(county):
    # Counties whose spelling differs from the FIPS table (e.g. 'Mcduffie' vs 'McDuffie')
    # and non-county rows fall through to 'na'
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'
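Scraped county names do not always match the FIPS table's spelling ('Mcduffie' vs 'McDuffie', and likely 'Dekalb' vs 'De Kalb'), so those rows come back as 'na' and drop off the map. A sketch of a normalized lookup (hypothetical helpers, not used in the runs below):

In [ ]:
def normalize(name):
    # lowercase and strip spaces so 'De Kalb' and 'Dekalb' compare equal
    return ''.join(name.lower().split())

fips_ga_norm = {normalize(k): v for k, v in fips_ga_dict.items()}

def get_fips_normalized(county):
    return fips_ga_norm.get(normalize(county), 'na')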
In [12]:
df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
df.head()
Out[12]:
name 2020-03-31 2020-04-01 2020-04-02 2020-04-03 2020-04-04 2020-04-05 2020-04-06 2020-04-07 2020-04-08 ... 2020-04-16 2020-04-17 2020-04-18 2020-04-19 2020-04-20 2020-04-21 2020-04-22 2020-04-23 2020-04-25 fips
0 Henry 68.0 90.0 95.0 115.0 143.0 167.0 168.0 194.0 208.0 ... 328.0 330.0 350.0 367.0 371.0 382.0 388.0 386.0 424.0 13151
1 Barrow 9.0 20.0 20.0 20.0 22.0 25.0 25.0 40.0 42.0 ... 68.0 81.0 85.0 90.0 90.0 93.0 101.0 103.0 111.0 13013
2 Paulding 26.0 32.0 33.0 35.0 46.0 50.0 51.0 60.0 62.0 ... 118.0 123.0 132.0 133.0 137.0 141.0 144.0 145.0 156.0 13223
3 Twiggs 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 2.0 ... 3.0 4.0 4.0 4.0 4.0 4.0 4.0 6.0 8.0 13289
4 Troup 15.0 19.0 19.0 21.0 30.0 34.0 34.0 46.0 47.0 ... 72.0 77.0 80.0 87.0 92.0 100.0 107.0 109.0 122.0 13285

5 rows × 27 columns

In [21]:
# Keep only the latest date column plus the fips code
df_sm = df[df.columns[-2:]]
df_sm.head()
In [23]:
df_sm.to_csv('georgia_latest.csv', index=False)
In [1]:
from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/plotly/datasets/master/fips-unemp-16.csv",
                   dtype={"fips": str})

import plotly.express as px

fig = px.choropleth(df, geojson=counties, locations='fips', color='unemp',
                           color_continuous_scale="Viridis",
                           range_color=(0, 12),
                           scope="usa",
                           labels={'unemp':'unemployment rate'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [22]:
import pandas as pd
g = pd.read_csv('georgia_latest.csv')
In [14]:
df['2020-04-25'].max()
Out[14]:
2509.0
In [24]:
g.columns[0]
Out[24]:
'2020-04-25'
In [67]:
date = '2020-04-25'
fig = px.choropleth(df, geojson=counties, locations='fips', color=date,
                    color_continuous_scale="Viridis",
                    range_color=(0, 500),
                    scope="usa",
                    labels={date: 'confirmed cases'}
                    )
fig.update_geos(fitbounds="locations", visible=False)
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.update_layout(
    title={
        'text': date,
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'})
fig.show()
In [47]:
def get_fips(county):
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'

df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
In [81]:
def make_plot(df):
    date = df.columns[0]
    metric = df.columns[1]
    fig = px.choropleth(df, geojson=counties, locations='fips', color=metric,
                        color_continuous_scale="Viridis",
                        range_color=(0, 100),
                        scope="usa",
                        labels={metric: 'new'}
                        )
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0}, title=date)
    fig.update_layout(
        title={
            'text': date,
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'})

    filename = 'georgianew' + date + '.png'
    fig.write_image(filename)
    fig.show()
In [83]:
for col in df.columns[1:3]:
# for col in df.columns[1:-1]:

    sm_df = df[[col, 'fips']]
    # NOTE: Series.diff() here differences down the rows (each county minus the county
    # above it), not day over day; a daily change would difference adjacent date columns.
    sm_df['new'] = sm_df[col].diff()
    sm_df = sm_df[[col, 'new', 'fips']]
    print(sm_df)

#     make_plot(sm_df)
     2020-03-31    new   fips
0          68.0    NaN  13151
1           9.0  -59.0  13013
2          26.0   17.0  13223
3           2.0  -24.0  13289
4          15.0   13.0  13285
5           NaN    NaN  13131
6           7.0    NaN  13029
7           NaN    NaN  13079
8           7.0    NaN  13085
9           0.0   -7.0     na
10          NaN    NaN  13305
11          8.0    NaN  13081
12          4.0   -4.0  13187
13         44.0   40.0  13177
14          2.0  -42.0  13317
15          4.0    2.0  13213
16          6.0    2.0  13035
17          4.0   -2.0  13033
18          NaN    NaN  13025
19          2.0    NaN  13229
20          NaN    NaN  13173
21         22.0    NaN  13217
22          1.0  -21.0  13065
23          3.0    2.0  13093
24          7.0    4.0  13253
25          3.0   -4.0  13195
26          NaN    NaN  13209
27          1.0    NaN  13019
28          3.0    2.0  13047
29          3.0    0.0  13257
..          ...    ...    ...
130         5.0    NaN  13299
131         NaN    NaN  13105
132         2.0    NaN     na
133        47.0   45.0  13059
134         1.0  -46.0  13263
135        43.0   42.0  13097
136         6.0  -37.0  13199
137        15.0    9.0  13129
138         NaN    NaN  13141
139         NaN    NaN  13137
140       503.0    NaN  13121
141         3.0 -500.0  13287
142         NaN    NaN  13271
143         NaN    NaN  13251
144         NaN    NaN  13241
145         NaN    NaN  13307
146         5.0    NaN  13293
147         NaN    NaN  13221
148         4.0    NaN  13103
149         1.0   -3.0  13165
150         1.0    0.0  13011
151         NaN    NaN  13003
152         2.0    NaN  13159
153         NaN    NaN  13197
154        69.0    NaN  13057
155         4.0  -65.0  13145
156         NaN    NaN     na
157         4.0    NaN  13119
158         2.0   -2.0  13169
159        19.0   17.0  13277

[160 rows x 3 columns]
     2020-04-01    new   fips
0          90.0    NaN  13151
1          20.0  -70.0  13013
2          32.0   12.0  13223
3           2.0  -30.0  13289
4          19.0   17.0  13285
5           1.0  -18.0  13131
6           8.0    7.0  13029
7           NaN    NaN  13079
8          11.0    NaN  13085
9           0.0  -11.0     na
10          NaN    NaN  13305
11         17.0    NaN  13081
12          7.0  -10.0  13187
13         94.0   87.0  13177
14          2.0  -92.0  13317
15          5.0    3.0  13213
16          8.0    3.0  13035
17          8.0    0.0  13033
18          NaN    NaN  13025
19          5.0    NaN  13229
20          NaN    NaN  13173
21         32.0    NaN  13217
22          1.0  -31.0  13065
23         10.0    9.0  13093
24          6.0   -4.0  13253
25          3.0   -3.0  13195
26          NaN    NaN  13209
27          2.0    NaN  13019
28          4.0    2.0  13047
29          5.0    1.0  13257
..          ...    ...    ...
130        10.0    NaN  13299
131         NaN    NaN  13105
132         3.0    NaN     na
133        51.0   48.0  13059
134         1.0  -50.0  13263
135        55.0   54.0  13097
136         7.0  -48.0  13199
137        18.0   11.0  13129
138         NaN    NaN  13141
139         2.0    NaN  13137
140       624.0  622.0  13121
141         4.0 -620.0  13287
142         NaN    NaN  13271
143         2.0    NaN  13251
144         2.0    0.0  13241
145         1.0   -1.0  13307
146         5.0    4.0  13293
147         1.0   -4.0  13221
148         5.0    4.0  13103
149         1.0   -4.0  13165
150         3.0    2.0  13011
151         NaN    NaN  13003
152         2.0    NaN  13159
153         NaN    NaN  13197
154        83.0    NaN  13057
155         3.0  -80.0  13145
156         NaN    NaN     na
157         5.0    NaN  13119
158         4.0   -1.0  13169
159        24.0   20.0  13277

[160 rows x 3 columns]
/Users/danielcaraway/.local/lib/python3.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
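As noted in the comment inside the loop, `Series.diff()` there differences down the rows rather than across dates. A sketch of true day-over-day new cases from the wide frame (name, date columns, fips), which is not what the output above shows:

In [ ]:
date_cols = [c for c in df.columns if c not in ('name', 'fips')]
new_cases = df[date_cols].diff(axis=1)   # column-wise difference = change since the previous date
new_cases.insert(0, 'fips', df['fips'])
new_cases.insert(0, 'name', df['name'])
new_cases.head()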

In [13]:
import pandas as pd
import plotly.express as px

from urllib.request import urlopen
import json
with urlopen('https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json') as response:
    counties = json.load(response)

# County name -> FIPS lookup for Georgia
fips = pd.read_csv('county_fips.csv')
fips_ga = fips[fips['State'] == 'GA']
fips_ga_dict = dict(zip(fips_ga.Name, fips_ga.FIPS))

df = pd.read_csv('all_georgia.csv')

def get_fips(county):
    try:
        return fips_ga_dict[county]
    except KeyError:
        return 'na'

df['fips'] = df.apply(lambda x: get_fips(x['name']), axis=1)
In [33]:
def make_plot(df):
    date = df.columns[0]
    fig = px.choropleth(df, geojson=counties, locations='fips', color=date,
                        color_continuous_scale="Viridis",
                        range_color=(0, 2500),
                        scope="usa",
                        labels={date: 'infected'}
                        )
    fig.update_geos(fitbounds="locations", visible=False)
    fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
    fig.update_layout(
        title={
            'text': date,
            'y':0.9,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top'},
        coloraxis_colorbar=dict(
            title="Number of infections",
            titleside="top",
            tickmode="array",
            tickvals=[0, 500, 1000, 1500, 2000, 2500],
            ticktext=["0", "500", "1000", "1500", "2000", "2500"],
            ticks="outside"
        )
    )
    filename = 'georgia_v5' + date + '.png'
    fig.write_image(filename)
    fig.show()
In [ ]:
# for col in df.columns[1:3]:
for col in df.columns[1:-1]:   # one map per date column (skips 'name' and 'fips')
    sm_df = df[[col, 'fips']]
    make_plot(sm_df)
In [ ]:
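The loop writes one 'georgia_v5<date>.png' per date. If the goal is an animation, the frames can be stitched together; a sketch assuming imageio is available (not part of the original notebook):

In [ ]:
import glob
import imageio

# Dates are ISO-formatted, so a lexical sort of the filenames is also chronological
frames = [imageio.imread(f) for f in sorted(glob.glob('georgia_v5*.png'))]
imageio.mimsave('georgia_timeline.gif', frames, duration=0.5)  # half a second per frame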