In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
/kaggle/input/covid19-global-forecasting-week-2/test.csv
/kaggle/input/covid19-global-forecasting-week-2/train.csv
/kaggle/input/covid19-global-forecasting-week-2/submission.csv
In [2]:
df_train = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/train.csv')
df_submission = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/submission.csv')
df_test = pd.read_csv('/kaggle/input/covid19-global-forecasting-week-2/test.csv')
In [3]:
df_train.rename(columns={'Country_Region':'Country'}, inplace=True)
df_test.rename(columns={'Country_Region':'Country'}, inplace=True)

df_train.rename(columns={'Province_State':'State'}, inplace=True)
df_test.rename(columns={'Province_State':'State'}, inplace=True)

df_train['Date'] = pd.to_datetime(df_train['Date'], infer_datetime_format=True)
df_test['Date'] = pd.to_datetime(df_test['Date'], infer_datetime_format=True)
In [4]:
y1_Train = df_train.iloc[:, -2]
y1_Train.head()

y2_Train = df_train.iloc[:, -1]
y2_Train.head()
Out[4]:
0    0.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: Fatalities, dtype: float64
In [5]:
EMPTY_VAL = "EMPTY_VAL"

def fillState(state, country):
    if state == EMPTY_VAL: return country
    return state
In [6]:
#X_Train = df_train.loc[:, ['State', 'Country', 'Date']]
X_Train = df_train.copy()

X_Train['State'].fillna(EMPTY_VAL, inplace=True)
X_Train['State'] = X_Train.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1)

X_Train.loc[:, 'Date'] = X_Train.Date.dt.strftime("%m%d")
X_Train["Date"]  = X_Train["Date"].astype(int)

X_Train.head()
Out[6]:
Id State Country Date ConfirmedCases Fatalities
0 1 Afghanistan Afghanistan 122 0.0 0.0
1 2 Afghanistan Afghanistan 123 0.0 0.0
2 3 Afghanistan Afghanistan 124 0.0 0.0
3 4 Afghanistan Afghanistan 125 0.0 0.0
4 5 Afghanistan Afghanistan 126 0.0 0.0
In [7]:
#X_Test = df_test.loc[:, ['State', 'Country', 'Date']]
X_Test = df_test.copy()

X_Test['State'].fillna(EMPTY_VAL, inplace=True)
X_Test['State'] = X_Test.loc[:, ['State', 'Country']].apply(lambda x : fillState(x['State'], x['Country']), axis=1)

X_Test.loc[:, 'Date'] = X_Test.Date.dt.strftime("%m%d")
X_Test["Date"]  = X_Test["Date"].astype(int)

X_Test.head()
Out[7]:
ForecastId State Country Date
0 1 Afghanistan Afghanistan 319
1 2 Afghanistan Afghanistan 320
2 3 Afghanistan Afghanistan 321
3 4 Afghanistan Afghanistan 322
4 5 Afghanistan Afghanistan 323
In [8]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()

X_Train.Country = le.fit_transform(X_Train.Country)
X_Train['State'] = le.fit_transform(X_Train['State'])

X_Train.head()
Out[8]:
Id State Country Date ConfirmedCases Fatalities
0 1 0 0 122 0.0 0.0
1 2 0 0 123 0.0 0.0
2 3 0 0 124 0.0 0.0
3 4 0 0 125 0.0 0.0
4 5 0 0 126 0.0 0.0
In [9]:
X_Test.Country = le.fit_transform(X_Test.Country)
X_Test['State'] = le.fit_transform(X_Test['State'])

X_Test.head()
Out[9]:
ForecastId State Country Date
0 1 0 0 319
1 2 0 0 320
2 3 0 0 321
3 4 0 0 322
4 5 0 0 323
In [10]:
from sklearn.model_selection import GridSearchCV 
import time 
param_grid = {'n_estimators': [1000]}

def gridSearchCV(model, X_Train, y_Train, param_grid, cv=10, scoring='neg_mean_squared_error'): 
    start = time.time()
In [11]:
from xgboost import XGBRegressor

model = XGBRegressor()

model1 = gridSearchCV(model, X_Train, y1_Train, param_grid, 10, 'neg_mean_squared_error') 
model2 = gridSearchCV(model, X_Train, y2_Train, param_grid, 10, 'neg_mean_squared_error')
In [12]:
from sklearn import preprocessing

le = preprocessing.LabelEncoder()
In [13]:
from xgboost import XGBRegressor

countries = X_Train.Country.unique()

#models_C = {}
#models_F = {}

df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})

for country in countries:
    states = X_Train.loc[X_Train.Country == country, :].State.unique()
    #print(country, states)
    # check whether string is nan or not
    for state in states:
        X_Train_CS = X_Train.loc[(X_Train.Country == country) & (X_Train.State == state), ['State', 'Country', 'Date', 'ConfirmedCases', 'Fatalities']]
        
        y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases']
        y2_Train_CS = X_Train_CS.loc[:, 'Fatalities']
        
        X_Train_CS = X_Train_CS.loc[:, ['State', 'Country', 'Date']]
        
        X_Train_CS.Country = le.fit_transform(X_Train_CS.Country)
        X_Train_CS['State'] = le.fit_transform(X_Train_CS['State'])
        
        X_Test_CS = X_Test.loc[(X_Test.Country == country) & (X_Test.State == state), ['State', 'Country', 'Date', 'ForecastId']]
        
        X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId']
        X_Test_CS = X_Test_CS.loc[:, ['State', 'Country', 'Date']]
        
        X_Test_CS.Country = le.fit_transform(X_Test_CS.Country)
        X_Test_CS['State'] = le.fit_transform(X_Test_CS['State'])
        
        #models_C[country] = gridSearchCV(model, X_Train_CS, y1_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        #models_F[country] = gridSearchCV(model, X_Train_CS, y2_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        
        model1 = XGBRegressor(n_estimators=1000)
        model1.fit(X_Train_CS, y1_Train_CS)
        y1_pred = model1.predict(X_Test_CS)
        
        model2 = XGBRegressor(n_estimators=1000)
        model2.fit(X_Train_CS, y2_Train_CS)
        y2_pred = model2.predict(X_Test_CS)
        
        df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred})
        df_out = pd.concat([df_out, df], axis=0)
    # Done for state loop
# Done for country Loop
In [14]:
df_out.ForecastId = df_out.ForecastId.astype('int')
df_out.tail()
Out[14]:
ForecastId ConfirmedCases Fatalities
12637 12638 6.998692 0.999658
12638 12639 6.998692 0.999658
12639 12640 6.998692 0.999658
12640 12641 6.998692 0.999658
12641 12642 6.998692 0.999658
In [15]:
df_out.to_csv('submission.csv', index=False)
In [ ]: