(simplest submission possible, no modeling)
import pandas as pd
import numpy as np
## GLOBAL WK 2
train_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/train.csv"
test_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/test.csv"
sub_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/submission.csv"
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
sub = pd.read_csv(sub_file)
# BEFORE
sub.head()
# AFTER
sub['ConfirmedCases'] = 100
sub['Fatalities'] = 18
sub.head()
sub.to_csv('submission.csv', index=False)
from google.colab import files
files.download("submission.csv")
Ok so what's the problem with this picture? Well, we're saying that every single country on every single day has exactly 100 cases of COVID and exactly 18 deaths. BUT we have proved we can manipulate the submission DF, so we've got that going for us, which is nice.
Now, it would also be nice to actually take into account the country and the date, right?
merged = pd.merge(sub, test, on="ForecastId", how="left")
df = merged.copy()
df
OK great! Now we have the country AND the date with our ForecastId!! So we know we can successfully merge our testing df into our submission df.
But... our ConfirmedCases and Fatalities are still 100 and 18 without regard to the country...
df['Date'] = pd.to_datetime(df['Date'])
df['days_from'] = df['Date'] - (df['Date'].min())
df['days_from'] = df['days_from'] / np.timedelta64(1, 'D')
df['CC_v2'] = df['days_from'] ** 2   # quadratic "cases" curve
df['F_v2'] = df['days_from'] * 2     # linear "fatalities" curve
df
OK great! Now the predictions at least change over time (every location still shares the same curve, but it's progress)! I call this VAMPIRE DATA, where the number of people bitten (infected) grows quadratically and the number of deaths is simply linear (because not everyone dies from bites, duh).
spain = df[df['Country_Region'] == 'Spain']
spain.head()
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
graph_df = df[['days_from', 'CC_v2', 'F_v2']]
data = pd.melt(graph_df, id_vars=['days_from'], value_vars=['CC_v2','F_v2'])
data.head()
ax = sns.lineplot(x="days_from", y="value",
hue="variable", style="variable", data=data)
Constant guesses and vampire math aside, it's time to actually model. Fresh start, same data.
import pandas as pd
import numpy as np
## GLOBAL WK 2
train_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/train.csv"
test_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/test.csv"
sub_file = "https://raw.githubusercontent.com/danielcaraway/data/master/covid19-global-forecasting-week-2/submission.csv"
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)
sub = pd.read_csv(sub_file)
# subset = train.sample(n=500)
# subset
# train = subset.copy()
# fill missing Province_State with the Country_Region so every row has a usable "state"
def use_country(state, country):
    if pd.isna(state):
        return country
    else:
        return state
train['Province_State'] = train.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
test['Province_State'] = test.apply(lambda x: use_country(x['Province_State'], x['Country_Region']), axis=1)
train_d = pd.get_dummies(train)
test_d = pd.get_dummies(test)
train_d
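One catch before modeling on these: calling pd.get_dummies on train and test separately gives you different column sets (the dummy columns for Date alone won't match). A minimal alignment sketch, assuming we drop the targets from the train side first; DataFrame.align pads whatever is missing with zeros. The X_train_d / X_test_d names are illustrative.
X_train_d = train_d.drop(['ConfirmedCases', 'Fatalities'], axis=1)
X_train_d, X_test_d = X_train_d.align(test_d, join='left', axis=1, fill_value=0)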
from sklearn.model_selection import GridSearchCV
import time
param_grid = {'n_estimators': [1000]}
def gridSearchCV(model, X_Train, y_Train, param_grid, cv=10, scoring='neg_mean_squared_error'):
    # wrap sklearn's GridSearchCV: time the search and hand back the best model
    start = time.time()
    gs = GridSearchCV(model, param_grid, cv=cv, scoring=scoring)
    gs.fit(X_Train, y_Train)
    print("GridSearchCV took %.2f seconds" % (time.time() - start))
    return gs.best_estimator_
X_Train = train.copy()
y1_Train = X_Train['ConfirmedCases']
y2_Train = X_Train['Fatalities']
from xgboost import XGBRegressor
model = XGBRegressor()
# NOTE: these calls would crash as-is -- X_Train still holds string columns
# (Province_State, Country_Region, Date) plus the targets themselves.
# See the sketch just below for one way to make them runnable.
# model1 = gridSearchCV(model, X_Train, y1_Train, param_grid, 10, 'neg_mean_squared_error')
# model2 = gridSearchCV(model, X_Train, y2_Train, param_grid, 10, 'neg_mean_squared_error')
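For the record, here's a sketch of how those grid-search calls could run. The label encoding and the MMDD date-to-int squash are my own illustration to get everything numeric, not something this notebook originally did:
from sklearn.preprocessing import LabelEncoder
features = train.copy()
features['Country_Region'] = LabelEncoder().fit_transform(features['Country_Region'])
features['Province_State'] = LabelEncoder().fit_transform(features['Province_State'])
features['Date'] = pd.to_datetime(features['Date']).dt.strftime('%m%d').astype(int)
X_numeric = features[['Province_State', 'Country_Region', 'Date']]
model1 = gridSearchCV(XGBRegressor(), X_numeric, y1_Train, param_grid, 5)
model2 = gridSearchCV(XGBRegressor(), X_numeric, y2_Train, param_grid, 5)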
from sklearn.preprocessing import LabelEncoder  # needed for the `le` used below
le = LabelEncoder()
X_Test = test.copy()
countries = set(X_Train['Country_Region'])
#models_C = {}
#models_F = {}
df_out = pd.DataFrame({'ForecastId': [], 'ConfirmedCases': [], 'Fatalities': []})
for country in countries:
    # only the states that actually belong to this country
    states = X_Train.loc[X_Train['Country_Region'] == country, 'Province_State'].unique()
    #print(country, states)
    for state in states:
        X_Train_CS = X_Train.loc[(X_Train['Country_Region'] == country) & (X_Train['Province_State'] == state),
                                 ['Province_State', 'Country_Region', 'Date', 'ConfirmedCases', 'Fatalities']]
        y1_Train_CS = X_Train_CS.loc[:, 'ConfirmedCases']
        y2_Train_CS = X_Train_CS.loc[:, 'Fatalities']
        X_Train_CS = X_Train_CS.loc[:, ['Province_State', 'Country_Region', 'Date']].copy()
        # XGBoost wants numbers: label-encode the strings, squash the date to an int (MMDD)
        X_Train_CS['Country_Region'] = le.fit_transform(X_Train_CS['Country_Region'])
        X_Train_CS['Province_State'] = le.fit_transform(X_Train_CS['Province_State'])
        X_Train_CS['Date'] = pd.to_datetime(X_Train_CS['Date']).dt.strftime('%m%d').astype(int)
        X_Test_CS = X_Test.loc[(X_Test['Country_Region'] == country) & (X_Test['Province_State'] == state),
                               ['Province_State', 'Country_Region', 'Date', 'ForecastId']]
        X_Test_CS_Id = X_Test_CS.loc[:, 'ForecastId']
        X_Test_CS = X_Test_CS.loc[:, ['Province_State', 'Country_Region', 'Date']].copy()
        X_Test_CS['Country_Region'] = le.fit_transform(X_Test_CS['Country_Region'])
        X_Test_CS['Province_State'] = le.fit_transform(X_Test_CS['Province_State'])
        X_Test_CS['Date'] = pd.to_datetime(X_Test_CS['Date']).dt.strftime('%m%d').astype(int)
        #models_C[country] = gridSearchCV(model, X_Train_CS, y1_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        #models_F[country] = gridSearchCV(model, X_Train_CS, y2_Train_CS, param_grid, 10, 'neg_mean_squared_error')
        model1 = XGBRegressor(n_estimators=1000)
        model1.fit(X_Train_CS, y1_Train_CS)
        y1_pred = model1.predict(X_Test_CS)
        model2 = XGBRegressor(n_estimators=1000)
        model2.fit(X_Train_CS, y2_Train_CS)
        y2_pred = model2.predict(X_Test_CS)
        df = pd.DataFrame({'ForecastId': X_Test_CS_Id, 'ConfirmedCases': y1_pred, 'Fatalities': y2_pred})
        df_out = pd.concat([df_out, df], axis=0)
    # Done for state loop
# Done for country loop
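The loop builds df_out but never writes it anywhere. If we wanted to submit these per-state XGBoost predictions, the same export pattern from before applies:
# ForecastId comes back as float because df_out started as an empty frame; cast before writing
df_out['ForecastId'] = df_out['ForecastId'].astype(int)
df_out.to_csv('submission.csv', index=False)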
# Back up: prototype on a single state (California) first
b_train = train.copy()
b_test = test.copy()
b_train['Date'].min()
b_train_ca = b_train[b_train['Province_State'] == 'California']
b_train = b_train_ca.copy()
b_train['Date'] = pd.to_datetime(b_train['Date'])
b_train['days_from'] = b_train['Date'] - (b_train['Date'].min())
b_train['days_from'] = b_train['days_from'] / np.timedelta64(1, 'D')
b_train
b_train_y1 = b_train['ConfirmedCases']
b_train_y2 = b_train['Fatalities']
# b_train_X = b_train.drop(['ConfirmedCases','Fatalities'], axis=1)
# keep ConfirmedCases out of the features -- predicting it *with* it would be target leakage
b_train_X = b_train.drop(['ConfirmedCases', 'Fatalities', 'Date'], axis=1)
b_train_X
## CA TEST
# b_train_X_ca = b_train_X[b_train_X['Province_State'] == 'California']
# b_train_X = b_train_X_ca.copy()
b_train_X_d = pd.get_dummies(b_train_X)
import xgboost as xgb
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
boston = load_boston()  # the tutorial's example dataset this scaffold came from; our real x/y come next
# x, y = boston.data, boston.target
x,y = b_train_X_d, b_train_y1
# x,y = b_train_X_d, b_train_y2
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.15)
xgbr = xgb.XGBRegressor()
print(xgbr)
xgbr.fit(xtrain, ytrain)
# cross-validation
scores = cross_val_score(xgbr, xtrain, ytrain, cv=5)
print("Mean cross-validation score: %.2f" % scores.mean())
kfold = KFold(n_splits=10, shuffle=True)
kf_cv_scores = cross_val_score(xgbr, xtrain, ytrain, cv=kfold )
print("K-fold CV average score: %.2f" % kf_cv_scores.mean())
ypred = xgbr.predict(xtest)
mse = mean_squared_error(ytest, ypred)
print("MSE: %.2f" % mse)
print("RMSE: %.2f" % np.sqrt(mse))
x_ax = range(len(ytest))
plt.scatter(x_ax, ytest, s=5, color="blue", label="original")
plt.plot(x_ax, ypred, lw=0.8, color="red", label="predicted")
plt.legend()
plt.show()
# BUILDING FORECAST ID:
# (the boston lines below are leftover shape checks from the tutorial scaffold)
boston.data
boston.data.shape
boston.target.shape
print(ypred)
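Here's what building those ForecastId predictions could look like for California: rebuild the same days_from feature on the test rows (anchored to the training minimum date so the numbers mean the same thing) and predict with the fitted xgbr. This is my sketch of the obvious next step, not something the notebook ran:
b_test_ca = b_test[b_test['Province_State'] == 'California'].copy()
b_test_ca['Date'] = pd.to_datetime(b_test_ca['Date'])
b_test_ca['days_from'] = (b_test_ca['Date'] - b_train['Date'].min()) / np.timedelta64(1, 'D')
b_test_X_d = pd.get_dummies(b_test_ca.drop(['Date'], axis=1))
# pad/reorder to the training columns (the train-side Id just gets zero-filled; fine for a sketch)
b_test_X_d = b_test_X_d.reindex(columns=xtrain.columns, fill_value=0)
ca_preds = xgbr.predict(b_test_X_d)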
train.head()
test.head(50)
sub.head()
countries = set(train['Country_Region'])
states = set(train['Province_State'])
for country in countries:
    # train model
    # run model
    # make predictions
    # print predictions
    pass  # placeholder -- the Prophet loop below fills this in
from fbprophet import Prophet
def get_prof_preds_for(df, n):
    # fit Prophet on a two-column (ds, y) frame and forecast n days past the end
    m = Prophet(daily_seasonality=True)
    m.fit(df)
    future = m.make_future_dataframe(periods=n)
    forecast = m.predict(future)
    return forecast
    # fig1 = m.plot(forecast)
# quick smoke test: every location mashed into one global series, just to see Prophet run
sm = train[['Date','ConfirmedCases']].copy()
sm.columns = ['ds', 'y']
get_prof_preds_for(sm, 43)
big_df = pd.DataFrame()
for country in list(countries)[:3]:
    df = train[train['Country_Region'] == country]
    sm = df[['Date','ConfirmedCases']].copy()
    sm.columns = ['ds', 'y']
    results = get_prof_preds_for(sm, 30)
    new_df = results[['ds', 'trend']].copy()
    new_df['country'] = country
    big_df = pd.concat([big_df, new_df])
    print(results)
big_df
df
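big_df now holds a Prophet trend per country (well, three of them). To turn that into an actual submission we'd still have to join it back onto the test set's ForecastIds. A rough sketch, ignoring multi-state countries and using the trend as the prediction (prophet_sub is an illustrative name):
big_df['ds'] = big_df['ds'].dt.strftime('%Y-%m-%d')  # match the string dates in test
merged = pd.merge(test, big_df, left_on=['Country_Region', 'Date'], right_on=['country', 'ds'], how='left')
prophet_sub = merged[['ForecastId']].copy()
prophet_sub['ConfirmedCases'] = merged['trend'].fillna(0).clip(lower=0)
prophet_sub['Fatalities'] = 0  # Prophet was only fit on ConfirmedCases here
prophet_sub.to_csv('submission.csv', index=False)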