In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.
/kaggle/input/covid19-global-forecasting-week-2/submission.csv
/kaggle/input/covid19-global-forecasting-week-2/test.csv
/kaggle/input/covid19-global-forecasting-week-2/train.csv
In [2]:
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots
import plotly
plotly.offline.init_notebook_mode()  # initialize offline mode so charts render in the notebook

import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
%matplotlib inline

from tqdm import tqdm

def RMSLE(pred, actual):
    # Root Mean Squared Logarithmic Error; np.log1p(x) == np.log(x + 1), and the metric is symmetric in its arguments.
    return np.sqrt(np.mean((np.log1p(pred) - np.log1p(actual)) ** 2))
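
A quick sanity check (my addition, toy arrays only, not part of the original run): perfect predictions score 0, and swapping the arguments leaves the value unchanged.

# Toy sanity check for RMSLE (hypothetical values).
_pred = np.array([10.0, 100.0, 1000.0])
_actual = np.array([12.0, 110.0, 900.0])
assert RMSLE(_pred, _pred) == 0.0
assert np.isclose(RMSLE(_pred, _actual), RMSLE(_actual, _pred))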
In [3]:
pd.set_option('mode.chained_assignment', None)
test = pd.read_csv("../input/covid19-global-forecasting-week-2/test.csv")
train = pd.read_csv("../input/covid19-global-forecasting-week-2/train.csv")
train['Province_State'].fillna('', inplace=True)
test['Province_State'].fillna('', inplace=True)
train['Date'] = pd.to_datetime(train['Date'])
test['Date'] = pd.to_datetime(test['Date'])
train = train.sort_values(['Country_Region','Province_State','Date'])
test = test.sort_values(['Country_Region','Province_State','Date'])
# Enforce non-decreasing cumulative counts per region (guards against downward revisions in the data).
train[['ConfirmedCases', 'Fatalities']] = train.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases', 'Fatalities']].transform('cummax')
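
The cummax transform is worth a second look: a minimal sketch on toy data (not from the competition files) shows how a downward revision in a reported series is flattened so counts stay non-decreasing.

# Toy illustration of groupby + cummax (hypothetical data).
_demo = pd.DataFrame({'g': ['a'] * 4, 'cases': [1, 5, 3, 7]})
print(_demo.groupby('g')['cases'].transform('cummax').tolist())  # [1, 5, 5, 7]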
In [4]:
from sklearn.linear_model import LinearRegression, BayesianRidge
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

feature_day = [1,20,50,100,200,500,1000]
def CreateInput(data):
    # Builds "days since the N-case threshold was reached" features.
    # Note: relies on the loop globals `country` and `province` to look up thresholds in train.
    feature = []
    for day in feature_day:
        # Find the last date the region was still below the threshold in the train data.
        data.loc[:,'Number day from ' + str(day) + ' case'] = 0
        if (train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['ConfirmedCases'] < day)]['Date'].count() > 0):
            fromday = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['ConfirmedCases'] < day)]['Date'].max()
        else:
            fromday = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]['Date'].min()
        for i in range(0, len(data)):
            if (data['Date'].iloc[i] > fromday):
                day_delta = data['Date'].iloc[i] - fromday
                data['Number day from ' + str(day) + ' case'].iloc[i] = day_delta.days
        feature = feature + ['Number day from ' + str(day) + ' case']

    return data[feature]
pred_data_all = pd.DataFrame()
with tqdm(total=len(train['Country_Region'].unique())) as pbar:
    for country in train['Country_Region'].unique():
    #for country in ['Japan']:
        for province in train[(train['Country_Region'] == country)]['Province_State'].unique():
            df_train = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]
            df_test = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
            X_train = CreateInput(df_train)
            y_train_confirmed = df_train['ConfirmedCases'].values
            y_train_fatalities = df_train['Fatalities'].values
            X_pred = CreateInput(df_test)

            # Pick the highest case-threshold feature that is already active on the first prediction day
            feature_use = X_pred.columns[0]
            for i in range(X_pred.shape[1] - 1,0,-1):
                if (X_pred.iloc[0,i] > 0):
                    feature_use = X_pred.columns[i]
                    break
            # Drop the leading rows where the chosen feature is still 0 (before the threshold date)
            idx = X_train[X_train[feature_use] == 0].shape[0]
            adjusted_X_train = X_train[idx:][feature_use].values.reshape(-1, 1)
            adjusted_y_train_confirmed = y_train_confirmed[idx:]
            adjusted_y_train_fatalities = y_train_fatalities[idx:]
              
            adjusted_X_pred = X_pred[feature_use].values.reshape(-1, 1)

            model = make_pipeline(PolynomialFeatures(2), BayesianRidge())
            model.fit(adjusted_X_train,adjusted_y_train_confirmed)                
            y_hat_confirmed = model.predict(adjusted_X_pred)

            model.fit(adjusted_X_train,adjusted_y_train_fatalities)                
            y_hat_fatalities = model.predict(adjusted_X_pred)

            pred_data = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
            pred_data['ConfirmedCases_hat'] = y_hat_confirmed
            pred_data['Fatalities_hat'] = y_hat_fatalities
            pred_data_all = pd.concat([pred_data_all, pred_data])  # DataFrame.append is removed in pandas >= 2.0
        pbar.update(1)
    
df_val = pd.merge(pred_data_all,train[['Date','Country_Region','Province_State','ConfirmedCases','Fatalities']],on=['Date','Country_Region','Province_State'], how='left')
df_val.loc[df_val['Fatalities_hat'] < 0,'Fatalities_hat'] = 0
df_val.loc[df_val['ConfirmedCases_hat'] < 0,'ConfirmedCases_hat'] = 0

df_val_1 = df_val.copy()
100%|██████████| 173/173 [02:30<00:00,  1.15it/s]
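
For reference, a minimal standalone sketch (synthetic data of my own, same degree-2 PolynomialFeatures + BayesianRidge pipeline as In [4]) of how this model extrapolates a quadratic trend:

# Sketch on synthetic data: fit a noisy quadratic, then predict beyond the training range.
_days = np.arange(20).reshape(-1, 1)
_cases = 3 + 0.5 * _days.ravel() ** 2 + np.random.default_rng(0).normal(0, 5, 20)
_m = make_pipeline(PolynomialFeatures(2), BayesianRidge())
_m.fit(_days, _cases)
print(_m.predict(np.array([[25], [30]])))  # quadratic extrapolation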
In [5]:
RMSLE(df_val[df_val['ConfirmedCases'].notnull()]['ConfirmedCases_hat'].values, df_val[df_val['ConfirmedCases'].notnull()]['ConfirmedCases'].values)
Out[5]:
0.19742156774212843
In [6]:
RMSLE(df_val[df_val['Fatalities'].notnull()]['Fatalities_hat'].values, df_val[df_val['Fatalities'].notnull()]['Fatalities'].values)
Out[6]:
0.1828241464847915
In [7]:
val_score = []
for country in df_val['Country_Region'].unique():
    df_val_country = df_val[(df_val['Country_Region'] == country) & (df_val['Fatalities'].notnull())]
    val_score.append([country,
                      RMSLE(df_val_country['ConfirmedCases_hat'].values, df_val_country['ConfirmedCases'].values),
                      RMSLE(df_val_country['Fatalities_hat'].values, df_val_country['Fatalities'].values)])
    
df_val_score = pd.DataFrame(val_score) 
df_val_score.columns = ['Country','ConfirmedCases_Scored','Fatalities_Scored']
df_val_score.sort_values('ConfirmedCases_Scored', ascending=False)
Out[7]:
Country ConfirmedCases_Scored Fatalities_Scored
101 Mali 1.121247 0.133608
164 Ukraine 0.854126 0.244006
163 Uganda 0.656863 0.000000
65 Grenada 0.565423 0.000000
68 Guinea-Bissau 0.522935 0.000000
... ... ... ...
99 Malaysia 0.020022 0.212704
151 Sweden 0.019551 0.183070
128 Poland 0.014691 0.082859
44 Diamond Princess 0.010608 0.084689
136 Saint Vincent and the Grenadines 0.000000 0.000000

173 rows × 3 columns

In [8]:
import warnings
from statsmodels.tsa.statespace.sarimax import SARIMAX

# feature_day and CreateInput are reused unchanged from In [4].
pred_data_all = pd.DataFrame()
with tqdm(total=len(train['Country_Region'].unique())) as pbar:
    for country in train['Country_Region'].unique():
    #for country in ['Vietnam']:
        for province in train[(train['Country_Region'] == country)]['Province_State'].unique():
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore")
                df_train = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]
                df_test = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
                X_train = CreateInput(df_train)
                y_train_confirmed = df_train['ConfirmedCases'].values
                y_train_fatalities = df_train['Fatalities'].values
                X_pred = CreateInput(df_test)

                feature_use = X_pred.columns[0]
                for i in range(X_pred.shape[1] - 1,0,-1):
                    if (X_pred.iloc[0,i] > 0):
                        feature_use = X_pred.columns[i]
                        break
                idx = X_train[X_train[feature_use] == 0].shape[0]

                # Only the truncated target series are needed for SARIMAX below.
                adjusted_y_train_confirmed = y_train_confirmed[idx:]
                adjusted_y_train_fatalities = y_train_fatalities[idx:]

                pred_data = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
                max_train_date = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]['Date'].max()
                min_test_date = pred_data['Date'].min()
                model = SARIMAX(adjusted_y_train_confirmed, order=(1,1,0),
                                #seasonal_order=(1,1,0,12),
                                measurement_error=True).fit(disp=False)
                # Forecast only the dates beyond the last train date...
                y_hat_confirmed = model.forecast(pred_data[pred_data['Date'] > max_train_date].shape[0])
                # ...and fill the test/train overlap with the actual observed values.
                y_train_confirmed = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['Date'] >= min_test_date)]['ConfirmedCases'].values
                y_hat_confirmed = np.concatenate((y_train_confirmed, y_hat_confirmed), axis=0)
                model = SARIMAX(adjusted_y_train_fatalities, order=(1,1,0),
                                #seasonal_order=(1,1,0,12),
                                measurement_error=True).fit(disp=False)
                y_hat_fatalities = model.forecast(pred_data[pred_data['Date'] > max_train_date].shape[0])
                y_train_fatalities = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['Date'] >= min_test_date)]['Fatalities'].values
                y_hat_fatalities = np.concatenate((y_train_fatalities, y_hat_fatalities), axis=0)
                pred_data['ConfirmedCases_hat'] = y_hat_confirmed
                pred_data['Fatalities_hat'] = y_hat_fatalities
                pred_data_all = pd.concat([pred_data_all, pred_data])
        pbar.update(1)
df_val = pd.merge(pred_data_all,train[['Date','Country_Region','Province_State','ConfirmedCases','Fatalities']],on=['Date','Country_Region','Province_State'], how='left')
df_val.loc[df_val['Fatalities_hat'] < 0,'Fatalities_hat'] = 0
df_val.loc[df_val['ConfirmedCases_hat'] < 0,'ConfirmedCases_hat'] = 0
df_val_3 = df_val.copy()
100%|██████████| 173/173 [03:06<00:00,  1.08s/it]
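
As a standalone reference (synthetic series of my own, same order and options as the calls above): order=(1,1,0) differences the cumulative series once and fits an AR(1) on the daily increments.

# Sketch on a toy cumulative series (hypothetical data).
_series = np.cumsum(np.abs(np.random.default_rng(1).normal(5, 2, 40)))
_fit = SARIMAX(_series, order=(1, 1, 0), measurement_error=True).fit(disp=False)
print(_fit.forecast(5))  # next 5 steps of the cumulative series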
In [9]:
# method_list = ['Poly Bayesian Ridge','Exponential Smoothing','SARIMA']
# method_val = [df_val_1,df_val_2,df_val_3]
# for i in range(0,3):
#     df_val = method_val[i]
#     method_score = [method_list[i]] + [RMSLE(df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases'].values,df_val[(df_val['ConfirmedCases'].isnull() == False)]['ConfirmedCases_hat'].values)] + [RMSLE(df_val[(df_val['Fatalities'].isnull() == False)]['Fatalities'].values,df_val[(df_val['Fatalities'].isnull() == False)]['Fatalities_hat'].values)]
#     print (method_score)
In [10]:
df_val = df_val_3
submission = df_val[['ForecastId','ConfirmedCases_hat','Fatalities_hat']]
submission.columns = ['ForecastId','ConfirmedCases','Fatalities']
submission = submission.round({'ConfirmedCases': 0, 'Fatalities': 0})
submission.to_csv('submission.csv', index=False)
submission
Out[10]:
ForecastId ConfirmedCases Fatalities
0 1 22.0 0.0
1 2 24.0 0.0
2 3 24.0 0.0
3 4 40.0 1.0
4 5 40.0 1.0
... ... ... ...
12637 12638 9.0 1.0
12638 12639 9.0 1.0
12639 12640 9.0 1.0
12640 12641 9.0 1.0
12641 12642 9.0 1.0

12642 rows × 3 columns
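
A lightweight pre-upload sanity check (my addition; assumes the week-2 schema of ForecastId, ConfirmedCases, Fatalities) can catch schema or sign errors before submitting:

# Hypothetical pre-upload checks: schema, no negatives, no NaNs.
assert list(submission.columns) == ['ForecastId', 'ConfirmedCases', 'Fatalities']
assert (submission[['ConfirmedCases', 'Fatalities']] >= 0).all().all()
assert submission.notnull().all().all()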
