{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/kaggle/input/covid19-global-forecasting-week-2/test.csv\n", "/kaggle/input/covid19-global-forecasting-week-2/train.csv\n", "/kaggle/input/covid19-global-forecasting-week-2/submission.csv\n" ] } ], "source": [
  "# Kaggle Python 3 environment (kaggle/python docker image: https://github.com/kaggle/docker-python).\n",
  "# Core analytics packages used by the rest of the notebook:\n",
  "\n",
  "import numpy as np # linear algebra\n",
  "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n",
  "\n",
  "# Input data files are available in the \"../input/\" directory.\n",
  "# Walk the input directory and print the full path of every file found there.\n",
  "\n",
  "import os\n",
  "for folder, _, names in os.walk('/kaggle/input'):\n",
  "    for name in names:\n",
  "        print(os.path.join(folder, name))\n",
  "\n",
  "# Anything written to the current directory is saved as notebook output."
] }, { "cell_type": "code", "execution_count": 2, "metadata": { "_cell_guid": "79c7e3d0-c299-4dcb-8224-4455121ee9b0", "_uuid": "d629ff2d2480ee46fbb7e2d37f6b5fab8052498a" }, "outputs": [], "source": [
  "# Single place to edit if the competition data moves.\n",
  "INPUT_DIR = '/kaggle/input/covid19-global-forecasting-week-2'\n",
  "\n",
  "df_train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))\n",
  "df_submission = pd.read_csv(os.path.join(INPUT_DIR, 'submission.csv'))\n",
  "df_test = pd.read_csv(os.path.join(INPUT_DIR, 'test.csv'))"
] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [
  "# Shorter column names, applied identically to both frames.\n",
  "# rename() returns a new frame, so re-running this cell is harmless.\n",
  "COLUMN_NAMES = {'Country_Region': 'Country', 'Province_State': 'State'}\n",
  "\n",
  "df_train = df_train.rename(columns=COLUMN_NAMES)\n",
  "df_test = df_test.rename(columns=COLUMN_NAMES)\n",
  "\n",
  "# `infer_datetime_format` is deprecated in pandas 2.x (format inference is the default),\n",
  "# so the argument is no longer passed.\n",
  "df_train['Date'] = pd.to_datetime(df_train['Date'])\n",
  "df_test['Date'] = pd.to_datetime(df_test['Date'])"
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0.0\n", "1 0.0\n", "2 0.0\n", "3 0.0\n", "4 0.0\n", "Name: Fatalities, dtype: float64" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Select the two targets by name rather than by position, so a change in\n",
  "# column order cannot silently swap or replace them.\n",
  "y1_Train = df_train['ConfirmedCases']\n",
  "y2_Train = df_train['Fatalities']\n",
  "\n",
  "y2_Train.head()"
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
  "EMPTY_VAL = \"EMPTY_VAL\"\n",
  "\n",
  "def fillState(state, country):\n",
  "    \"\"\"Return `state`, or `country` when `state` is the EMPTY_VAL placeholder.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    state : value of the State column for one row (EMPTY_VAL marks a missing state).\n",
  "    country : value of the Country column for the same row.\n",
  "    \"\"\"\n",
  "    if state == EMPTY_VAL:\n",
  "        return country\n",
  "    return state"
] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdStateCountryDateConfirmedCasesFatalities
01AfghanistanAfghanistan1220.00.0
12AfghanistanAfghanistan1230.00.0
23AfghanistanAfghanistan1240.00.0
34AfghanistanAfghanistan1250.00.0
45AfghanistanAfghanistan1260.00.0
\n", "
" ], "text/plain": [ " Id State Country Date ConfirmedCases Fatalities\n", "0 1 Afghanistan Afghanistan 122 0.0 0.0\n", "1 2 Afghanistan Afghanistan 123 0.0 0.0\n", "2 3 Afghanistan Afghanistan 124 0.0 0.0\n", "3 4 Afghanistan Afghanistan 125 0.0 0.0\n", "4 5 Afghanistan Afghanistan 126 0.0 0.0" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Work on a copy so df_train keeps its original labels and dates.\n",
  "X_Train = df_train.copy()\n",
  "\n",
  "# Rows with no State use the Country name as their State label.\n",
  "# The filled column is assigned back explicitly: a chained\n",
  "# `X_Train['State'].fillna(..., inplace=True)` does not update the frame\n",
  "# under pandas copy-on-write.\n",
  "X_Train['State'] = X_Train['State'].fillna(X_Train['Country'])\n",
  "\n",
  "# Encode the date as the integer MMDD (e.g. Jan 22 -> 122). Plain column\n",
  "# assignment replaces the datetime64 column instead of writing strings into it.\n",
  "# NOTE(review): MMDD is not evenly spaced across month boundaries (131 -> 201).\n",
  "X_Train['Date'] = X_Train['Date'].dt.strftime('%m%d').astype(int)\n",
  "\n",
  "X_Train.head()"
] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ForecastIdStateCountryDate
01AfghanistanAfghanistan319
12AfghanistanAfghanistan320
23AfghanistanAfghanistan321
34AfghanistanAfghanistan322
45AfghanistanAfghanistan323
\n", "
" ], "text/plain": [ " ForecastId State Country Date\n", "0 1 Afghanistan Afghanistan 319\n", "1 2 Afghanistan Afghanistan 320\n", "2 3 Afghanistan Afghanistan 321\n", "3 4 Afghanistan Afghanistan 322\n", "4 5 Afghanistan Afghanistan 323" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Work on a copy so df_test keeps its original labels and dates.\n",
  "X_Test = df_test.copy()\n",
  "\n",
  "# Rows with no State use the Country name as their State label.\n",
  "# The filled column is assigned back explicitly: a chained\n",
  "# `X_Test['State'].fillna(..., inplace=True)` does not update the frame\n",
  "# under pandas copy-on-write.\n",
  "X_Test['State'] = X_Test['State'].fillna(X_Test['Country'])\n",
  "\n",
  "# Encode the date as the integer MMDD (e.g. Mar 19 -> 319). Plain column\n",
  "# assignment replaces the datetime64 column instead of writing strings into it.\n",
  "# NOTE(review): MMDD is not evenly spaced across month boundaries (331 -> 401).\n",
  "X_Test['Date'] = X_Test['Date'].dt.strftime('%m%d').astype(int)\n",
  "\n",
  "X_Test.head()"
] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IdStateCountryDateConfirmedCasesFatalities
01001220.00.0
12001230.00.0
23001240.00.0
34001250.00.0
45001260.00.0
\n", "
" ], "text/plain": [ " Id State Country Date ConfirmedCases Fatalities\n", "0 1 0 0 122 0.0 0.0\n", "1 2 0 0 123 0.0 0.0\n", "2 3 0 0 124 0.0 0.0\n", "3 4 0 0 125 0.0 0.0\n", "4 5 0 0 126 0.0 0.0" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "from sklearn import preprocessing\n",
  "\n",
  "# Replace the Country and State labels with integer codes.\n",
  "# One encoder is refit per column (Country first, then State), so after this\n",
  "# cell `le` holds the State classes.\n",
  "le = preprocessing.LabelEncoder()\n",
  "\n",
  "for column in ('Country', 'State'):\n",
  "    X_Train[column] = le.fit_transform(X_Train[column])\n",
  "\n",
  "X_Train.head()"
] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ForecastIdStateCountryDate
0100319
1200320
2300321
3400322
4500323
\n", "
" ], "text/plain": [ " ForecastId State Country Date\n", "0 1 0 0 319\n", "1 2 0 0 320\n", "2 3 0 0 321\n", "3 4 0 0 322\n", "4 5 0 0 323" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Encode the test labels with mappings fitted on the TRAINING labels, so a given\n",
  "# country/state gets the same integer code in X_Train and X_Test. Refitting the\n",
  "# encoder on the test labels only lines up when both label sets are identical.\n",
  "# `transform` raises ValueError if the test set holds a label unseen in training.\n",
  "train_states = df_train['State'].fillna(df_train['Country'])\n",
  "\n",
  "country_encoder = preprocessing.LabelEncoder().fit(df_train['Country'])\n",
  "state_encoder = preprocessing.LabelEncoder().fit(train_states)\n",
  "\n",
  "X_Test['Country'] = country_encoder.transform(X_Test['Country'])\n",
  "X_Test['State'] = state_encoder.transform(X_Test['State'])\n",
  "\n",
  "X_Test.head()"
] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [
  "from sklearn.model_selection import GridSearchCV\n",
  "import time\n",
  "\n",
  "param_grid = {'n_estimators': [1000]}\n",
  "\n",
  "# Model inputs. X_Train also carries Id and both targets, which must not be used as features.\n",
  "FEATURES = ['State', 'Country', 'Date']\n",
  "\n",
  "def gridSearchCV(model, X_Train, y_Train, param_grid, cv=10, scoring='neg_mean_squared_error'):\n",
  "    \"\"\"Grid-search `model` over `param_grid` with cross-validation.\n",
  "\n",
  "    Parameters\n",
  "    ----------\n",
  "    model : estimator to tune.\n",
  "    X_Train, y_Train : training features and target.\n",
  "    param_grid : dict of parameter name -> list of candidate values.\n",
  "    cv : number of cross-validation folds.\n",
  "    scoring : scikit-learn scoring name used to rank candidates.\n",
  "\n",
  "    Returns\n",
  "    -------\n",
  "    The best estimator, refit on all of (X_Train, y_Train).\n",
  "    \"\"\"\n",
  "    start = time.time()\n",
  "\n",
  "    grid_cv = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)\n",
  "    grid_cv.fit(X_Train, y_Train)\n",
  "\n",
  "    print(f'Grid search took {time.time() - start:.1f}s; best params: {grid_cv.best_params_}; best score: {grid_cv.best_score_:.4f}')\n",
  "    return grid_cv.best_estimator_"
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
  "from xgboost import XGBRegressor\n",
  "\n",
  "model = XGBRegressor()\n",
  "\n",
  "# Pass only the feature columns; the full X_Train would leak the targets into the model.\n",
  "model1 = gridSearchCV(model, X_Train[FEATURES], y1_Train, param_grid, 10, 'neg_mean_squared_error')\n",
  "model2 = gridSearchCV(model, X_Train[FEATURES], y2_Train, param_grid, 10, 'neg_mean_squared_error')"
] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [
  "from sklearn import preprocessing\n",
  "\n",
  "le = preprocessing.LabelEncoder()"
] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
  "from xgboost import XGBRegressor\n",
  "\n",
  "# One forecast frame per (country, state) series. They are concatenated once\n",
  "# after the loop instead of growing df_out with pd.concat on every iteration.\n",
  "forecasts = []\n",
  "\n",
  "for country in X_Train.Country.unique():\n",
  "    states = X_Train.loc[X_Train.Country == country, 'State'].unique()\n",
  "    for state in states:\n",
  "        train_rows = (X_Train.Country == country) & (X_Train.State == state)\n",
  "        test_rows = (X_Test.Country == country) & (X_Test.State == state)\n",
  "\n",
  "        X_Test_CS = X_Test.loc[test_rows, FEATURES]\n",
  "        if X_Test_CS.empty:\n",
  "            # No rows to forecast for this series.\n",
  "            continue\n",
  "\n",
  "        # State and Country are constant within one series, so they are passed\n",
  "        # through as-is; re-encoding them per group only ever produced 0.\n",
  "        X_Train_CS = X_Train.loc[train_rows, FEATURES]\n",
  "        y1_Train_CS = X_Train.loc[train_rows, 'ConfirmedCases']\n",
  "        y2_Train_CS = X_Train.loc[train_rows, 'Fatalities']\n",
  "\n",
  "        # Separate regressors for the two targets.\n",
  "        model1 = XGBRegressor(n_estimators=1000)\n",
  "        model1.fit(X_Train_CS, y1_Train_CS)\n",
  "        y1_pred = model1.predict(X_Test_CS)\n",
  "\n",
  "        model2 = XGBRegressor(n_estimators=1000)\n",
  "        model2.fit(X_Train_CS, y2_Train_CS)\n",
  "        y2_pred = model2.predict(X_Test_CS)\n",
  "\n",
  "        forecasts.append(pd.DataFrame({\n",
  "            'ForecastId': X_Test.loc[test_rows, 'ForecastId'],\n",
  "            'ConfirmedCases': y1_pred,\n",
  "            'Fatalities': y2_pred,\n",
  "        }))\n",
  "\n",
  "df_out = pd.concat(forecasts, axis=0)"
] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ForecastIdConfirmedCasesFatalities
12637126386.9986920.999658
12638126396.9986920.999658
12639126406.9986920.999658
12640126416.9986920.999658
12641126426.9986920.999658
\n", "
" ], "text/plain": [ " ForecastId ConfirmedCases Fatalities\n", "12637 12638 6.998692 0.999658\n", "12638 12639 6.998692 0.999658\n", "12639 12640 6.998692 0.999658\n", "12640 12641 6.998692 0.999658\n", "12641 12642 6.998692 0.999658" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
  "# Store ForecastId as an integer column, then inspect the last rows.\n",
  "df_out = df_out.astype({'ForecastId': 'int'})\n",
  "df_out.tail()"
] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
  "# Write the forecasts without the DataFrame index.\n",
  "df_out.to_csv('submission.csv', index=False)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 4 }