In [1]:
# Kaggle Python 3 environment (kaggle/python Docker image):
# https://github.com/kaggle/docker-python
# It ships with many useful analytics libraries pre-installed.

import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import os

# Input data files live under the read-only "/kaggle/input/" directory.
# Walk it and print every file so the available inputs are visible in the output.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Anything written to the current working directory is saved as kernel output.
/kaggle/input/submission-prophet/submission_prophet.csv
/kaggle/input/covid19-global-forecasting-week-3/train.csv
/kaggle/input/covid19-global-forecasting-week-3/submission.csv
/kaggle/input/covid19-global-forecasting-week-3/test.csv
In [2]:
# Paths to the competition data files (week-3 global forecasting challenge).
COMP_DIR = "/kaggle/input/covid19-global-forecasting-week-3"
train_file = COMP_DIR + "/train.csv"
sub_file = COMP_DIR + "/submission.csv"
test_file = COMP_DIR + "/test.csv"
In [3]:
# Load the competition tables (train / test / sample submission) into DataFrames.
train, test, sub = (pd.read_csv(path) for path in (train_file, test_file, sub_file))
In [4]:
# NOTE: Cells In[4]-In[8] below are a retained, deliberately disabled SARIMAX
# baseline experiment; the submitted forecasts come from the precomputed
# Prophet output loaded in In[9].
# def use_country(state, country):
#     if pd.isna(state):
#         return country
#     else:
#         return state

# train['Province_State'] = train.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
# test['Province_State'] = test.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
In [5]:
# train['Province_State'].fillna('', inplace=True)
# test['Province_State'].fillna('', inplace=True)
# train['Date'] =  pd.to_datetime(train['Date'])
# test['Date'] =  pd.to_datetime(test['Date'])
# train = train.sort_values(['Country_Region','Province_State','Date'])
# test = test.sort_values(['Country_Region','Province_State','Date'])
# train[['ConfirmedCases', 'Fatalities']] = train.groupby(['Country_Region', 'Province_State'])[['ConfirmedCases', 'Fatalities']].transform('cummax') 
In [6]:
# from sklearn.linear_model import LinearRegression, BayesianRidge
# from sklearn.preprocessing import PolynomialFeatures
# from sklearn.pipeline import make_pipeline
In [7]:
# from tqdm import tqdm

# def RMSLE(pred,actual):
#     return np.sqrt(np.mean(np.power((np.log(pred+1)-np.log(actual+1)),2)))

# import warnings
# from statsmodels.tsa.statespace.sarimax import SARIMAX
# from statsmodels.tsa.arima_model import ARIMA

# feature_day = [1,5,25,50,75,100,150,200]
# def CreateInput(data):
#     feature = []
#     for day in feature_day:
#         #Get information in train data
#         data.loc[:,'Number day from ' + str(day) + ' case'] = 0
#         if (train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['ConfirmedCases'] < day)]['Date'].count() > 0):
#             fromday = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['ConfirmedCases'] < day)]['Date'].max()        
#         else:
#             fromday = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]['Date'].min()       
#         for i in range(0, len(data)):
#             if (data['Date'].iloc[i] > fromday):
#                 day_denta = data['Date'].iloc[i] - fromday
#                 data['Number day from ' + str(day) + ' case'].iloc[i] = day_denta.days 
#         feature = feature + ['Number day from ' + str(day) + ' case']
    
#     return data[feature]
# pred_data_all = pd.DataFrame()
# with tqdm(total=len(train['Country_Region'].unique())) as pbar:
#     for country in train['Country_Region'].unique():
#     #for country in ['Vietnam']:
#         for province in train[(train['Country_Region'] == country)]['Province_State'].unique():
#             with warnings.catch_warnings():
#                 warnings.filterwarnings("ignore")
#                 df_train = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]
#                 df_test = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
#                 X_train = CreateInput(df_train)
#                 y_train_confirmed = df_train['ConfirmedCases'].ravel()
#                 y_train_fatalities = df_train['Fatalities'].ravel()
#                 X_pred = CreateInput(df_test)

#                 feature_use = X_pred.columns[0]
#                 for i in range(X_pred.shape[1] - 1,0,-1):
#                     if (X_pred.iloc[0,i] > 0):
#                         feature_use = X_pred.columns[i]
#                         break
#                 idx = X_train[X_train[feature_use] == 0].shape[0] 

                
#                 adjusted_X_train = X_train[idx:][feature_use].values.reshape(-1, 1)
#                 adjusted_y_train_confirmed = y_train_confirmed[idx:]
#                 adjusted_y_train_fatalities = y_train_fatalities[idx:] #.values.reshape(-1, 1)
#                 idx = X_pred[X_pred[feature_use] == 0].shape[0]    
#                 adjusted_X_pred = X_pred[idx:][feature_use].values.reshape(-1, 1)

                
                
#                 pred_data = test[(test['Country_Region'] == country) & (test['Province_State'] == province)]
#                 max_train_date = train[(train['Country_Region'] == country) & (train['Province_State'] == province)]['Date'].max()
#                 min_test_date = pred_data['Date'].min()
#                 model = SARIMAX(adjusted_y_train_confirmed, order=(1,1,0), 
#                                 #seasonal_order=(1,1,0,12),
#                                 measurement_error=True).fit(disp=False)
                
                
                
#                 y_hat_confirmed = model.forecast(pred_data[pred_data['Date'] > max_train_date].shape[0])
#                 y_train_confirmed = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['Date'] >=  min_test_date)]['ConfirmedCases'].values
#                 y_hat_confirmed = np.concatenate((y_train_confirmed,y_hat_confirmed), axis = 0)
#                 model = SARIMAX(adjusted_y_train_fatalities, order=(1,1,0), 
#                                 #seasonal_order=(1,1,0,12),
#                                 measurement_error=True).fit(disp=False)
                
                
                
#                 y_hat_fatalities = model.forecast(pred_data[pred_data['Date'] > max_train_date].shape[0])
#                 y_train_fatalities = train[(train['Country_Region'] == country) & (train['Province_State'] == province) & (train['Date'] >=  min_test_date)]['Fatalities'].values
#                 y_hat_fatalities = np.concatenate((y_train_fatalities,y_hat_fatalities), axis = 0)
#                 pred_data['ConfirmedCases_hat'] =  y_hat_confirmed
#                 pred_data['Fatalities_hat'] = y_hat_fatalities
#                 pred_data_all = pred_data_all.append(pred_data)
#         pbar.update(1)
# df_val = pd.merge(pred_data_all,train[['Date','Country_Region','Province_State','ConfirmedCases','Fatalities']],on=['Date','Country_Region','Province_State'], how='left')
# df_val.loc[df_val['Fatalities_hat'] < 0,'Fatalities_hat'] = 0
# df_val.loc[df_val['ConfirmedCases_hat'] < 0,'ConfirmedCases_hat'] = 0
# df_val_3 = df_val.copy()
In [8]:
# df_val = df_val_3
# submission = df_val[['ForecastId','ConfirmedCases_hat','Fatalities_hat']]
# submission.columns = ['ForecastId','ConfirmedCases','Fatalities']
# submission = submission.round({'ConfirmedCases': 0, 'Fatalities': 0})
# submission.to_csv('submission.csv', index=False)
# submission
In [9]:
# Load the precomputed Prophet forecasts attached as an input dataset.
prophet_path = '/kaggle/input/submission-prophet/submission_prophet.csv'
prophet = pd.read_csv(prophet_path)
In [10]:
# Inspect the loaded Prophet forecasts (bare expression renders the DataFrame).
prophet
Out[10]:
ForecastId ConfirmedCases Fatalities
0 1 209.299217 3.179487
1 2 216.207390 3.577285
2 3 223.115815 3.975121
3 4 230.024239 4.372970
4 5 236.932664 4.770819
... ... ... ...
13153 13154 23.242448 2.009881
13154 13155 23.768728 2.054925
13155 13156 24.295008 2.099970
13156 13157 24.821288 2.145015
13157 13158 25.347567 2.190060

13158 rows × 3 columns

In [ ]:
 
In [11]:
# Missing predictions would invalidate the submission — replace any NaN with 0.
prophet = prophet.fillna(value=0)
In [12]:
# Write the final predictions in the format the competition expects
# (ForecastId, ConfirmedCases, Fatalities; no index column).
prophet.to_csv(path_or_buf='submission.csv', index=False)
In [ ]: