import pandas as pd
import numpy as np
## GLOBAL WK 2
train_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/train.csv"
test_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/test.csv"
sub_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/submission.csv"
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
sub = pd.read_csv(sub_file)
# Optional: peek at a 500-row sample (uncomment the last line to iterate on the sample only)
subset = train.sample(n=500)
subset
# train = subset.copy()
def use_country(state, country):
    # Fall back to the country name when the province/state field is missing
    if pd.isna(state):
        return country
    else:
        return state
train['Province_State'] = train.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
test['Province_State'] = test.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
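# Quick sanity check (an added illustration, not in the original notebook): after the
# fill above, no Province_State entries should remain missing in either frame.
print(train['Province_State'].isna().sum(), test['Province_State'].isna().sum())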
train_d = pd.get_dummies(train)
test_d = pd.get_dummies(test)
train_d
from sklearn.model_selection import GridSearchCV
import time
param_grid = {'n_estimators': [1000]}
def gridSearchCV(model, X_Train, y_Train, param_grid, cv=10, scoring='neg_mean_squared_error'):
    # Wrap sklearn's GridSearchCV: search over param_grid and return the best estimator
    start = time.time()
    search = GridSearchCV(estimator=model, param_grid=param_grid, cv=cv, scoring=scoring, n_jobs=-1)
    search.fit(X_Train, y_Train)
    print("Grid search finished in %.1f s" % (time.time() - start))
    return search.best_estimator_
X_Train = train.copy()
y1_Train = X_Train['ConfirmedCases']
y2_Train = X_Train['Fatalities']
from xgboost import XGBRegressor
model = XGBRegressor()
# These whole-frame searches would fail on the raw string columns (Country_Region, Province_State, Date),
# so they are left commented out; the per-state loop below fits XGBRegressor models on encoded slices instead.
# model1 = gridSearchCV(model, X_Train, y1_Train, param_grid, 10, 'neg_mean_squared_error')
# model2 = gridSearchCV(model, X_Train, y2_Train, param_grid, 10, 'neg_mean_squared_error')
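# A minimal usage sketch for the gridSearchCV helper above (an illustration, not part of the
# original pipeline): run it on the dummy-encoded frame so every feature is numeric.
# small_grid, X_num and best_cases_model are hypothetical names; the tiny grid keeps the search fast.
small_grid = {'n_estimators': [100, 300]}
X_num = train_d.drop(['ConfirmedCases', 'Fatalities'], axis=1)
best_cases_model = gridSearchCV(XGBRegressor(), X_num, y1_Train, small_grid, cv=3)
print(best_cases_model.n_estimators)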
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
X_Test = test.copy()
# XGBoost needs numeric features, so turn the date string into an integer (e.g. '2020-03-01' -> 20200301)
X_Train['Date'] = X_Train['Date'].str.replace('-', '').astype(int)
X_Test['Date'] = X_Test['Date'].str.replace('-', '').astype(int)
countries = set(X_Train['Country_Region'])
#models_C = {}
#models_F = {}
df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})
for country in countries:
    # Only loop over the states that actually belong to this country
    states = X_Train.loc[X_Train['Country_Region'] == country, 'Province_State'].unique()
    #print(country, states)
    for state in states:
        X_Train_CS = X_Train.loc[(X_Train['Country_Region'] == country) & (X_Train['Province_State'] == state),
                                 ['Province_State', 'Country_Region', 'Date', 'ConfirmedCases', 'Fatalities']]
        y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases']
        y2_Train_CS = X_Train_CS.loc[:, 'Fatalities']
        X_Train_CS = X_Train_CS.loc[:, ['Province_State', 'Country_Region', 'Date']]
        X_Train_CS['Country_Region'] = le.fit_transform(X_Train_CS['Country_Region'])
        X_Train_CS['Province_State'] = le.fit_transform(X_Train_CS['Province_State'])
        X_Test_CS = X_Test.loc[(X_Test['Country_Region'] == country) & (X_Test['Province_State'] == state),
                               ['Province_State', 'Country_Region', 'Date', 'ForecastId']]
        X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId']
        X_Test_CS = X_Test_CS.loc[:, ['Province_State', 'Country_Region', 'Date']]
        X_Test_CS['Country_Region'] = le.fit_transform(X_Test_CS['Country_Region'])
        X_Test_CS['Province_State'] = le.fit_transform(X_Test_CS['Province_State'])
        #models_C[country] = gridSearchCV(model, X_Train_CS, y1_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        #models_F[country] = gridSearchCV(model, X_Train_CS, y2_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        model1 = XGBRegressor(n_estimators=1000)
        model1.fit(X_Train_CS, y1_Train_CS)
        y1_pred = model1.predict(X_Test_CS)
        model2 = XGBRegressor(n_estimators=1000)
        model2.fit(X_Train_CS, y2_Train_CS)
        y2_pred = model2.predict(X_Test_CS)
        df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred})
        df_out = pd.concat([df_out, df], axis=0)
    # Done for state loop
# Done for country loop
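# A hedged sketch of the final step (assumed, not shown in the original notebook): the
# Kaggle submission expects one row per ForecastId, in the same shape as the `sub` template
# loaded above, so cast the id column back to int and write df_out to disk.
df_out['ForecastId'] = df_out['ForecastId'].astype('int')
df_out.to_csv('submission.csv', index=False)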
b_train = train.copy()
b_test = test.copy()
b_train_y1 = b_train['ConfirmedCases']
b_train_y2 = b_train['Fatalities']
b_train_X = b_train.drop(['ConfirmedCases','Fatalities'], axis=1)
b_train_X
b_train_X_d = pd.get_dummies(b_train_X)
b_train_X_d
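# One-hot encoding Date, Province_State and Country_Region widens the frame considerably;
# a quick check of the expansion (an added illustration):
print(b_train_X.shape, '->', b_train_X_d.shape)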
import xgboost as xgb
# from sklearn.datasets import load_boston  # Boston housing data is not used below (and load_boston was removed from recent scikit-learn)
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
# boston = load_boston()
# x, y = boston.data, boston.target
x, y = b_train_X_d, b_train_y1
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.15)
xgbr = xgb.XGBRegressor()
print(xgbr)
xgbr.fit(xtrain, ytrain)
# Cross-validation
scores = cross_val_score(xgbr, xtrain, ytrain, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbr, xtrain, ytrain, cv=kfold)
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
ypred = xgbr.predict(xtest)
mse = mean_squared_error(ytest, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % np.sqrt(mse))
x_ax = range(len(ytest))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
# Leftover inspection of the unused Boston dataset, kept commented out:
# boston.data
# boston.data.shape
# boston.target.shape