## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string

from tabulate import tabulate
import numpy as np

# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

vectorizers = [
    CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
#     CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    print('*****MNB*****')
    y_pred = mnb_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('*****CONFUSION MATRIX*****')
    print(cm)
    
    target_names = target_names
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.figure(figsize=(16,10))  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show
    
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.figure(figsize=(16,10))  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show
    
    cm2 = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#     print('*****CLASSIFICATION REPORT*****')
#     print(classification_report(y_test, y_pred, target_names=target_names))
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
#     print('TYPE', type(report))
#     print(report['0'])
    df = pd.DataFrame(report)
#     print(pd.DataFrame(report).T['recall'])
    print('BY SENTIMENT')
    print(tabulate(df[df.columns[0:4]], tablefmt="simple", headers=df.columns, floatfmt=".2f"))
    print('BY PERFORMANCE')
    print(tabulate(df[df.columns[5:8]], tablefmt="simple", headers=df.columns[5:8], floatfmt=".2f"))
    return(pd.DataFrame(report).T)
    
#     print('*****SCORES*****')
#     print(mnb_clf.score(X_test_vec, y_test))
    
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)
    print('=====SVM=====')
    y_pred = svm_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    target_names = target_names
    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
    return(pd.DataFrame(report).T)
    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec,y_test))
    
def do_the_thing(X,y,labels, target_names):
    for vec in vectorizers:
#         print(vec)
        X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
        run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
#         run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)

import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])

*****MNB*****
*****CONFUSION MATRIX*****
[[  733  1264   817   106    11]
 [  602  4132  5411   649    30]
 [  246  2397 25756  3226   239]
 [   19   454  5580  6248   767]
 [    1    54   725  1972   985]]
BY SENTIMENT
0                1         2         3         4
---------  -------  --------  --------  --------
precision     0.46      0.50      0.67      0.51
recall        0.25      0.38      0.81      0.48
f1-score      0.32      0.43      0.73      0.49
support    2931.00  10824.00  31864.00  13068.00
BY PERFORMANCE
             accuracy    macro avg    weighted avg
---------  ----------  -----------  --------------
precision        0.61         0.53            0.59
recall           0.61         0.44            0.61
f1-score         0.61         0.47            0.59
support          0.61     62424.00        62424.00
*****MNB*****
*****CONFUSION MATRIX*****
[[  742  1276   797   105    11]
 [  614  4126  5397   655    32]
 [  248  2385 25756  3239   236]
 [   19   456  5570  6253   770]
 [    1    53   729  1977   977]]
BY SENTIMENT
0                1         2         3         4
---------  -------  --------  --------  --------
precision     0.46      0.50      0.67      0.51
recall        0.25      0.38      0.81      0.48
f1-score      0.33      0.43      0.73      0.49
support    2931.00  10824.00  31864.00  13068.00
BY PERFORMANCE
             accuracy    macro avg    weighted avg
---------  ----------  -----------  --------------
precision        0.61         0.52            0.59
recall           0.61         0.44            0.61
f1-score         0.61         0.47            0.59
support          0.61     62424.00        62424.00

import seaborn as sns
import matplotlib.pyplot as plt
m_df = mnb_report.copy()
s_df = svm_report.copy()

def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))
    plt.show()

# barplot_single_report(m_df)

def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
    plt.title('Comparing MNB & SVM')
    plt.show()
# barplot_both_report(m_df, s_df)

def get_report(X,y,labels, target_names):
    vec = vectorizers[0]
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    mnb_report = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    svm_report = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    barplot_single_report(mnb_report)
    barplot_single_report(svm_report)
    barplot_both_report(mnb_report, svm_report)
#     return mnb_report, svm_report

import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
# do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
# mnb_report, svm_report = get_report(X,y,[0,1,2,3,4],['0','1','2','3','4'])
get_report(X,y,[0,1,2,3,4],['0','1','2','3','4'])

*****MNB*****
*****CONFUSION MATRIX*****
[[  733  1264   817   106    11]
 [  602  4132  5411   649    30]
 [  246  2397 25756  3226   239]
 [   19   454  5580  6248   767]
 [    1    54   725  1972   985]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.46      0.25      0.32      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

    accuracy                           0.61     62424
   macro avg       0.53      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424

TYPE <class 'dict'>
{'precision': 0.45783885071830105, 'recall': 0.25008529512111904, 'f1-score': 0.323477493380406, 'support': 2931}
0               0.250085
1               0.381744
2               0.808310
3               0.478114
4               0.263580
accuracy        0.606401
macro avg       0.436367
weighted avg    0.606401
Name: recall, dtype: float64
=====SVM=====
=====CONFUSION MATRIX=====
[[  913  1229   696    79    14]
 [  705  4094  5472   527    26]
 [  190  2111 27063  2324   176]
 [   33   394  6011  5568  1062]
 [    3    51   582  1775  1326]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.76     31864
           3       0.54      0.43      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

import seaborn as sns
import matplotlib.pyplot as plt
m_df = mnb_report.copy()
s_df = svm_report.copy()

def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))

barplot_single_report(m_df)
    
def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
barplot_both_report(m_df, s_df)

barplot_single_report(s_df)

m_df[5:6]['support']

accuracy    0.606401
Name: support, dtype: float64

m_df2 = m_df
s_df2 = s_df
m_df2['model'] = 'mnb'
s_df2['model'] = 'svm'
df = m_df2[:5].append(s_df2[:5])
df.reset_index(inplace=True)
df2 = df[['index','f1-score','model']]
df2
# df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
sns.barplot(x="index",
           y="f1-score",
           hue="model",
           data=df2);

def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
barplot_both_report(m_df, s_df)

m_df = mnb_report.copy()
s_df = svm_report.copy()
# mnb_report.T
# df = mnb_report.copy()
import seaborn as sns
import matplotlib.pyplot as plt


# df = df.drop('support', axis=1)

just_sent = s_df[:5]

just_sent = just_sent[just_sent.columns[0:3]]
just_sent.reset_index(inplace=True)
just_sent

df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
df
# sns.barplot(data = df)
sns.barplot(x="index",
           y="score_value",
           hue="sentiment",
           data=df);

mnb_report
df = mnb_report.copy().drop('support', axis=1)
df

# df2 = agree_df.groupby(['agree_factor', 'PoN']).count()
# df2.reset_index(inplace=True)

mnb_report['model'] = 'mnb'
svm_report['model'] = 'svm'
all_df = mnb_report.append(svm_report)
df2 = all_df.groupby(['model'])
df2 = pd.DataFrame(df2)
# sns.barplot(data = df2)
df2

df3 = mnb_report.T.reset_index()

# sns.barplot(data = df3)
df3 = mnb_report.T.reset_index()[:3]
df3
df4 = pd.melt(df3, id_vars="index", var_name="sentiment", value_name="score_value")
# sns.barplot(data = df4

sns.barplot(x="index",
           y="score_value",
           hue="sentiment",
           data=df4);
# df4 = pd.melt(df3, id_vars="", var_names="", value_name=" ")

sns.barplot(x="sentiment",
           y="score_value",
           hue="index",
           data=df4);

df5 = df4[df4.columns[0:3]]

df5

	precision	recall	f1-score	model
0	0.457839	0.250085	0.323477	mnb
1	0.497771	0.381744	0.432105	mnb
2	0.672674	0.808310	0.734281	mnb
3	0.512089	0.478114	0.494519	mnb
4	0.484744	0.263580	0.341480	mnb
accuracy	0.606401	0.606401	0.606401	mnb
macro avg	0.525023	0.436367	0.465172	mnb
weighted avg	0.587392	0.606401	0.588889	mnb

	0	1
0	mnb	precision recall f1-score ...
1	svm	precision recall f1-score ...

	index	sentiment	score_value
0	precision	0	0.457839
1	recall	0	0.250085
2	f1-score	0	0.323477
3	precision	1	0.497771
4	recall	1	0.381744
5	f1-score	1	0.432105
6	precision	2	0.672674
7	recall	2	0.80831
8	f1-score	2	0.734281
9	precision	3	0.512089
10	recall	3	0.478114
11	f1-score	3	0.494519
12	precision	4	0.484744
13	recall	4	0.26358
14	f1-score	4	0.34148
15	precision	accuracy	0.606401
16	recall	accuracy	0.606401
17	f1-score	accuracy	0.606401
18	precision	macro avg	0.525023
19	recall	macro avg	0.436367
20	f1-score	macro avg	0.465172
21	precision	weighted avg	0.587392
22	recall	weighted avg	0.606401
23	f1-score	weighted avg	0.588889