{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
"_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"/kaggle/input/covid19-global-forecasting-week-2/test.csv\n",
"/kaggle/input/covid19-global-forecasting-week-2/train.csv\n",
"/kaggle/input/covid19-global-forecasting-week-2/submission.csv\n"
]
}
],
"source": [
"# This Python 3 environment comes with many helpful analytics libraries installed\n",
"# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python\n",
"# For example, here are several helpful packages to load:\n",
"\n",
"import numpy as np # linear algebra\n",
"import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
"\n",
"# Input data files are available in the \"../input/\" directory.\n",
"# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n",
"\n",
"import os\n",
"# Walk the input directory tree lazily and print the full path of every file found.\n",
"input_file_paths = (\n",
"    os.path.join(folder, file_name)\n",
"    for folder, _, file_names in os.walk('/kaggle/input')\n",
"    for file_name in file_names\n",
")\n",
"for input_file_path in input_file_paths:\n",
"    print(input_file_path)\n",
"\n",
"# Any results you write to the current directory are saved as output."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0",
"_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a"
},
"outputs": [],
"source": [
"# Read the three competition CSVs in order: training data, submission file, test set.\n",
"df_train, df_submission, df_test = (\n",
"    pd.read_csv(csv_path)\n",
"    for csv_path in (\n",
"        '/kaggle/input/covid19-global-forecasting-week-2/train.csv',\n",
"        '/kaggle/input/covid19-global-forecasting-week-2/submission.csv',\n",
"        '/kaggle/input/covid19-global-forecasting-week-2/test.csv',\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Shorten the region column names and parse 'Date' into datetimes, identically for both frames.\n",
"column_aliases = {'Country_Region': 'Country', 'Province_State': 'State'}\n",
"\n",
"for frame in (df_train, df_test):\n",
"    frame.rename(columns=column_aliases, inplace=True)\n",
"    frame['Date'] = pd.to_datetime(frame['Date'], infer_datetime_format=True)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 0.0\n",
"1 0.0\n",
"2 0.0\n",
"3 0.0\n",
"4 0.0\n",
"Name: Fatalities, dtype: float64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Select the two regression targets by column name rather than by position, so the\n",
"# selection does not depend on the column order of the training file.\n",
"y1_Train = df_train['ConfirmedCases']\n",
"y2_Train = df_train['Fatalities']\n",
"\n",
"# Only the last expression of a cell is displayed, so preview a single target here.\n",
"y2_Train.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"# Placeholder written into 'State' where no province/state was given.\n",
"EMPTY_VAL = \"EMPTY_VAL\"\n",
"\n",
"def fillState(state, country):\n",
"    \"\"\"Return `country` when `state` is the EMPTY_VAL placeholder; otherwise return `state`.\"\"\"\n",
"    return country if state == EMPTY_VAL else state"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" State | \n",
" Country | \n",
" Date | \n",
" ConfirmedCases | \n",
" Fatalities | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 122 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 123 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 124 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 125 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 126 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id State Country Date ConfirmedCases Fatalities\n",
"0 1 Afghanistan Afghanistan 122 0.0 0.0\n",
"1 2 Afghanistan Afghanistan 123 0.0 0.0\n",
"2 3 Afghanistan Afghanistan 124 0.0 0.0\n",
"3 4 Afghanistan Afghanistan 125 0.0 0.0\n",
"4 5 Afghanistan Afghanistan 126 0.0 0.0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy so df_train keeps the original strings and datetimes.\n",
"X_Train = df_train.copy()\n",
"\n",
"# Rows without a province/state fall back to the country name. The result is assigned back\n",
"# instead of calling fillna(inplace=True) on a column selection, which may act on a\n",
"# temporary copy and leave X_Train unchanged.\n",
"X_Train['State'] = X_Train['State'].fillna(X_Train['Country'])\n",
"\n",
"# Encode the date as the integer MMDD (e.g. '0122' -> 122). The integer column is built in\n",
"# one step rather than first writing strings into the datetime column through .loc.\n",
"# NOTE(review): MMDD integers are not evenly spaced across month boundaries (131 -> 201).\n",
"X_Train['Date'] = X_Train['Date'].dt.strftime(\"%m%d\").astype(int)\n",
"\n",
"X_Train.head()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ForecastId | \n",
" State | \n",
" Country | \n",
" Date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 319 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 320 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 321 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 322 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" Afghanistan | \n",
" Afghanistan | \n",
" 323 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ForecastId State Country Date\n",
"0 1 Afghanistan Afghanistan 319\n",
"1 2 Afghanistan Afghanistan 320\n",
"2 3 Afghanistan Afghanistan 321\n",
"3 4 Afghanistan Afghanistan 322\n",
"4 5 Afghanistan Afghanistan 323"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Work on a copy so df_test keeps the original strings and datetimes.\n",
"X_Test = df_test.copy()\n",
"\n",
"# Rows without a province/state fall back to the country name. The result is assigned back\n",
"# instead of calling fillna(inplace=True) on a column selection, which may act on a\n",
"# temporary copy and leave X_Test unchanged.\n",
"X_Test['State'] = X_Test['State'].fillna(X_Test['Country'])\n",
"\n",
"# Encode the date as the integer MMDD (e.g. '0319' -> 319). The integer column is built in\n",
"# one step rather than first writing strings into the datetime column through .loc.\n",
"# NOTE(review): MMDD integers are not evenly spaced across month boundaries (331 -> 401).\n",
"X_Test['Date'] = X_Test['Date'].dt.strftime(\"%m%d\").astype(int)\n",
"\n",
"X_Test.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" Id | \n",
" State | \n",
" Country | \n",
" Date | \n",
" ConfirmedCases | \n",
" Fatalities | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 122 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 123 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 124 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 125 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 126 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" Id State Country Date ConfirmedCases Fatalities\n",
"0 1 0 0 122 0.0 0.0\n",
"1 2 0 0 123 0.0 0.0\n",
"2 3 0 0 124 0.0 0.0\n",
"3 4 0 0 125 0.0 0.0\n",
"4 5 0 0 126 0.0 0.0"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from sklearn import preprocessing\n",
"\n",
"le = preprocessing.LabelEncoder()\n",
"\n",
"# Replace the region names with integer codes; the shared encoder is re-fitted per column\n",
"# (Country first, then State).\n",
"for categorical_column in ('Country', 'State'):\n",
"    X_Train[categorical_column] = le.fit_transform(X_Train[categorical_column])\n",
"\n",
"X_Train.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ForecastId | \n",
" State | \n",
" Country | \n",
" Date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 319 | \n",
"
\n",
" \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 320 | \n",
"
\n",
" \n",
" 2 | \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 321 | \n",
"
\n",
" \n",
" 3 | \n",
" 4 | \n",
" 0 | \n",
" 0 | \n",
" 322 | \n",
"
\n",
" \n",
" 4 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 323 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ForecastId State Country Date\n",
"0 1 0 0 319\n",
"1 2 0 0 320\n",
"2 3 0 0 321\n",
"3 4 0 0 322\n",
"4 5 0 0 323"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Encode the test regions with encoders fitted on the *training* labels, so that every\n",
"# integer code refers to the same region in X_Train and X_Test. Fitting fresh encoders on the\n",
"# test frame only yields matching codes when both files contain exactly the same labels.\n",
"# df_train still holds the original strings; its missing states are filled with the country\n",
"# name, mirroring how the 'State' column of X_Train was prepared.\n",
"country_encoder = preprocessing.LabelEncoder().fit(df_train['Country'])\n",
"state_encoder = preprocessing.LabelEncoder().fit(df_train['State'].fillna(df_train['Country']))\n",
"\n",
"# transform() raises ValueError for a label that never occurs in the training data.\n",
"X_Test['Country'] = country_encoder.transform(X_Test['Country'])\n",
"X_Test['State'] = state_encoder.transform(X_Test['State'])\n",
"\n",
"X_Test.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import GridSearchCV\n",
"import time\n",
"\n",
"param_grid = {'n_estimators': [1000]}\n",
"\n",
"def gridSearchCV(model, X_Train, y_Train, param_grid, cv=10, scoring='neg_mean_squared_error'):\n",
"    \"\"\"Fit a cross-validated grid search and return the fitted GridSearchCV object.\n",
"\n",
"    model      -- estimator to tune\n",
"    X_Train    -- feature matrix passed to GridSearchCV.fit\n",
"    y_Train    -- target values passed to GridSearchCV.fit\n",
"    param_grid -- mapping of parameter names to the candidate values to try\n",
"    cv         -- cross-validation setting forwarded to GridSearchCV (default 10)\n",
"    scoring    -- scoring name forwarded to GridSearchCV\n",
"    \"\"\"\n",
"    start = time.time()\n",
"\n",
"    grid_cv = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)\n",
"    grid_cv.fit(X_Train, y_Train)\n",
"\n",
"    # Report the wall-clock time of the search, measured from `start`.\n",
"    print('Training time: %0.3fs' % (time.time() - start))\n",
"    return grid_cv"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from xgboost import XGBRegressor\n",
"\n",
"model = XGBRegressor()\n",
"\n",
"# Search on the predictor columns only: X_Train is a full copy of df_train, so it still\n",
"# contains 'Id' and both target columns, which must not be offered to the model as features.\n",
"feature_columns = ['State', 'Country', 'Date']\n",
"\n",
"model1 = gridSearchCV(model, X_Train[feature_columns], y1_Train, param_grid, cv=10, scoring='neg_mean_squared_error')\n",
"model2 = gridSearchCV(model, X_Train[feature_columns], y2_Train, param_grid, cv=10, scoring='neg_mean_squared_error')"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import preprocessing\n",
"\n",
"# Rebind `le` to a new, unfitted encoder.\n",
"le = preprocessing.LabelEncoder()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"from xgboost import XGBRegressor\n",
"\n",
"# Predictor columns given to every per-region model.\n",
"FEATURE_COLUMNS = ['State', 'Country', 'Date']\n",
"\n",
"countries = X_Train.Country.unique()\n",
"\n",
"# Empty seed frame: fixes the column layout of df_out even if no region yields predictions.\n",
"df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})\n",
"\n",
"# Per-region prediction frames, concatenated once after the loop instead of growing df_out\n",
"# with pd.concat on every iteration (which re-copies all earlier rows each time).\n",
"region_predictions = []\n",
"\n",
"for country in countries:\n",
"    states = X_Train.loc[X_Train.Country == country, 'State'].unique()\n",
"    for state in states:\n",
"        # Training rows of this (country, state) pair. The feature frame is copied so the\n",
"        # column assignments below act on an independent frame, not on a slice of X_Train.\n",
"        train_rows = X_Train.loc[(X_Train.Country == country) & (X_Train.State == state)]\n",
"        X_Train_CS = train_rows.loc[:, FEATURE_COLUMNS].copy()\n",
"        y1_Train_CS = train_rows.loc[:, 'ConfirmedCases']\n",
"        y2_Train_CS = train_rows.loc[:, 'Fatalities']\n",
"\n",
"        test_rows = X_Test.loc[(X_Test.Country == country) & (X_Test.State == state)]\n",
"        if test_rows.empty:\n",
"            # Nothing to forecast for this region; skip it rather than predicting on zero rows.\n",
"            continue\n",
"        X_Test_CS_Id = test_rows.loc[:, 'ForecastId']\n",
"        X_Test_CS = test_rows.loc[:, FEATURE_COLUMNS].copy()\n",
"\n",
"        # Within one region both columns hold a single value, so re-encoding maps them to 0\n",
"        # in the training slice and in the test slice alike.\n",
"        for column in ('Country', 'State'):\n",
"            X_Train_CS[column] = le.fit_transform(X_Train_CS[column])\n",
"            X_Test_CS[column] = le.fit_transform(X_Test_CS[column])\n",
"\n",
"        # One model per target: confirmed cases ...\n",
"        model1 = XGBRegressor(n_estimators=1000)\n",
"        model1.fit(X_Train_CS, y1_Train_CS)\n",
"        y1_pred = model1.predict(X_Test_CS)\n",
"\n",
"        # ... and fatalities.\n",
"        model2 = XGBRegressor(n_estimators=1000)\n",
"        model2.fit(X_Train_CS, y2_Train_CS)\n",
"        y2_pred = model2.predict(X_Test_CS)\n",
"\n",
"        region_predictions.append(\n",
"            pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred})\n",
"        )\n",
"    # Done for state loop\n",
"# Done for country loop\n",
"\n",
"# Single concatenation; the seed frame comes first, as in an incremental build.\n",
"df_out = pd.concat([df_out] + region_predictions, axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ForecastId | \n",
" ConfirmedCases | \n",
" Fatalities | \n",
"
\n",
" \n",
" \n",
" \n",
" 12637 | \n",
" 12638 | \n",
" 6.998692 | \n",
" 0.999658 | \n",
"
\n",
" \n",
" 12638 | \n",
" 12639 | \n",
" 6.998692 | \n",
" 0.999658 | \n",
"
\n",
" \n",
" 12639 | \n",
" 12640 | \n",
" 6.998692 | \n",
" 0.999658 | \n",
"
\n",
" \n",
" 12640 | \n",
" 12641 | \n",
" 6.998692 | \n",
" 0.999658 | \n",
"
\n",
" \n",
" 12641 | \n",
" 12642 | \n",
" 6.998692 | \n",
" 0.999658 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ForecastId ConfirmedCases Fatalities\n",
"12637 12638 6.998692 0.999658\n",
"12638 12639 6.998692 0.999658\n",
"12639 12640 6.998692 0.999658\n",
"12640 12641 6.998692 0.999658\n",
"12641 12642 6.998692 0.999658"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Store the forecast ids as integers, then preview the last rows of the submission frame.\n",
"df_out['ForecastId'] = df_out['ForecastId'].astype('int')\n",
"df_out.tail()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"# Write the predictions to the working directory without the index column.\n",
"df_out.to_csv(path_or_buf='submission.csv', index=False)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.6"
}
},
"nbformat": 4,
"nbformat_minor": 4
}