## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string

from tabulate import tabulate
import numpy as np

# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

vectorizers = [
    CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
    TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]


def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def make_pretty_cm(y_test, y_pred, labels, target_names):
    print('Confusion Matrices: Non-normalized and Normalized')
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    
    plt.figure(figsize=(18,9))    
    target_names = target_names
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.subplot(1, 2, 1)  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    
    cm_n = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_df = pd.DataFrame(cm_n,columns,index)                      
    plt.subplot(1, 2, 2)  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show()

import seaborn as sns
import matplotlib.pyplot as plt
# m_df = mnb_report.copy()
# s_df = svm_report.copy()

def barplot_all(report, title):
    report.drop('support', inplace=True, axis=1)
    report = report.T.reset_index()
#     report = report.T
#     report.reset_index(inplace=True)
#     df5 = df4[df4.columns[0:3]]
    df4 = pd.melt(report[report.columns[0:6]], id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.set_palette(sns.color_palette("bright"))
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df4);
    plt.title(title)
    plt.xlabel('Metrics by Score')
    plt.show()
    
    
#     print('THIS THIS THIS', report[report.columns[6:9]])
    
    df2 = report[report.columns[6:9]]
    df2.reset_index(inplace=True)
    df3 = pd.melt(df2, id_vars="index", var_name="metrics", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.set_palette(sns.color_palette("bright"))
    sns.barplot(x="index",
               y="score_value",
               hue="metrics",
               data=df3);
    plt.title(title)
    plt.xlabel('Scores by Metric')
    plt.show()
    
def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
#     just_sent.rename(columns={'Index':'Sentiment'}, inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="metrics", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="score_value",
               hue="metrics",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))
    plt.xlabel('Sentiment Score')
    plt.show()

def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
    plt.title('Comparing MNB & SVM')
    plt.xlabel('Sentiment Score')
    plt.show()

def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    print('----'*10)
    print('##MNB')
    print('----'*10)
    
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    y_pred = mnb_clf.predict(X_test_vec)
    
    make_pretty_cm(y_test, y_pred, labels, target_names)

    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)

    df = pd.DataFrame(report)
    print('======'*10)
    print('BY SENTIMENT')
    print('======'*10)
#     show_feature_table(df[df.columns[0:5]], 'BY SENTIMENT')
    print(tabulate(df[df.columns[0:5]], tablefmt="fancy_grid", headers=df.columns[0:5], floatfmt=".2f"))
    print('======'*10)
    print('BY PERFORMANCE')
    print('======'*10)
    print(tabulate(df[df.columns[5:8]], tablefmt="fancy_grid", headers=df.columns[5:8], floatfmt=".2f"))
    
    barplot_single_report(pd.DataFrame(report).T)
    barplot_all(pd.DataFrame(report).T, 'MNB')
    return pd.DataFrame(report).T, mnb_clf
    
def show_feature_table(df_feats, title):
    plt.figure()
    fig, ax = plt.subplots()
    # hide axes
    fig.patch.set_visible(False)
    ax.axis('off')
    ax.axis('tight')

    df = pd.DataFrame(df_feats)
    ax.table(cellText=df.values, colLabels=df.columns, loc='center')

    fig.tight_layout()
    plt.title(title)
    plt.show()
    
    
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    print('----'*10)
    print('##SVM')
    print('----'*10)
    
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)
    y_pred = svm_clf.predict(X_test_vec)

    target_names = target_names
    make_pretty_cm(y_test, y_pred, labels, target_names)
    
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)

    df = pd.DataFrame(report)
    print('======'*10)
    print('BY SENTIMENT')
    print('======'*10)
#     show_feature_table(df[df.columns[0:5]], 'BY SENTIMENT')
    print(tabulate(df[df.columns[0:5]], tablefmt="fancy_grid", headers=df.columns[0:5], floatfmt=".2f"))
    print('======'*10)
    print('BY PERFORMANCE')
    print('======'*10)
    print(tabulate(df[df.columns[5:8]], tablefmt="fancy_grid", headers=df.columns[5:8], floatfmt=".2f"))
    
    barplot_single_report(pd.DataFrame(report).T)
    barplot_all(pd.DataFrame(report).T, 'SVM')
    return pd.DataFrame(report).T, svm_clf


def get_features(vec, thingy):
    feature_ranks = sorted(zip(thingy.coef_[0], vec.get_feature_names()))

    very_negative_10 = feature_ranks[-10:]
#     print("Very negative words")
    vn = []
    for i in range(0, len(very_negative_10)):
        vn.append(very_negative_10[i])
#         print(very_negative_10[i])
    df_neg = pd.DataFrame(vn)
    show_feature_table(df_neg, 'Most Negative Words')
#     print()

    not_very_negative_10 = feature_ranks[:10]
#     print("Not very negative words")
    nvn = []
    for i in range(0, len(not_very_negative_10)):
#         print(not_very_negative_10[i])
        nvn.append(not_very_negative_10[i])
    df_pos = pd.DataFrame(nvn)
#     print(df_n)
    
    show_feature_table(df_pos, 'Most Positive Words')

    
def do_the_thing(X,y,labels, target_names):
    all_reports = []
    for i,vec in enumerate(vectorizers):
        

        
        params = vec.get_params()
        df = pd.DataFrame([params]).T
        vec_type = str(vec).split('(')[0]
        title = 'Vectorizer Settings for '+ str(i)+ '_' + vec_type
        print(title)
        df.reset_index(inplace=True)
        
        show_feature_table(df, '')

#         print(tabulate(df, tablefmt="fancy_grid", headers=df.columns, floatfmt=".2f"))

        X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
        mnb_report, mnb_clf = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
        svm_report, svm_clf = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
        barplot_both_report(mnb_report, svm_report)
        
        print('MNB FEATURES******************************')
        get_features(vec, mnb_clf)
        print('SVM FEATURES******************************')
        get_features(vec, svm_clf)
        all_reports.append({ i : {'mnb': mnb_report, 'svm': svm_report} })
    return all_reports

import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
all_reports = do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])

Vectorizer Settings for 0_CountVectorizer

<Figure size 432x288 with 0 Axes>

----------------------------------------
##MNB
----------------------------------------
Confusion Matrices: Non-normalized and Normalized

============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╤═════════╕
│           │       0 │        1 │        2 │        3 │       4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╪═════════╡
│ precision │    0.46 │     0.50 │     0.67 │     0.51 │    0.48 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ recall    │    0.25 │     0.38 │     0.81 │     0.48 │    0.26 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ f1-score  │    0.32 │     0.43 │     0.73 │     0.49 │    0.34 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │ 3737.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╧═════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.61 │        0.53 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.61 │        0.44 │           0.61 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.61 │        0.47 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.61 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛

----------------------------------------
##SVM
----------------------------------------
Confusion Matrices: Non-normalized and Normalized

============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╤═════════╕
│           │       0 │        1 │        2 │        3 │       4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╪═════════╡
│ precision │    0.50 │     0.52 │     0.68 │     0.54 │    0.51 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ recall    │    0.31 │     0.38 │     0.85 │     0.43 │    0.35 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ f1-score  │    0.38 │     0.44 │     0.76 │     0.48 │    0.42 │
├───────────┼─────────┼──────────┼──────────┼──────────┼─────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │ 3737.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╧═════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.62 │        0.55 │           0.60 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.62 │        0.46 │           0.62 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.62 │        0.49 │           0.60 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.62 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛

MNB FEATURES******************************

<Figure size 432x288 with 0 Axes>

<Figure size 432x288 with 0 Axes>

SVM FEATURES******************************

all_reports[0][0]['mnb']

vec_strings = [(str(v).split('(')[0], v.ngram_range) for v in vectorizers]

tiny_table = []
for i,vec in enumerate(all_reports):
    tiny_table.append(
        {'vectorizer': vec_strings[i][0],
         'bigrams': vec_strings[i][1],
         'mnb': vec[i]['mnb'].T['accuracy'][0],
         'svm':vec[i]['svm'].T['accuracy'][0]
        })
    print(vec_strings[i])
    print(vec[i]['mnb'].T['accuracy'][0])
    print(vec[i]['svm'].T['accuracy'][0])

('CountVectorizer', (1, 1))
0.606401384083045
0.6241830065359477
('CountVectorizer', (1, 1))
0.606401384083045
0.6236864026656415
('CountVectorizer', (1, 2))
0.5973824170190952
0.6300941945405614
('TfidfVectorizer', (1, 1))
0.5836056644880174
0.6254325259515571
('TfidfVectorizer', (1, 1))
0.5836056644880174
0.6254325259515571
('TfidfVectorizer', (1, 2))
0.5948993976675637
0.6301262334999359

vec_strings = [(str(v).split('(')[0], v.ngram_range) for v in vectorizers]

vec_strings

[('CountVectorizer', (1, 1)),
 ('CountVectorizer', (1, 1)),
 ('CountVectorizer', (1, 2)),
 ('TfidfVectorizer', (1, 1)),
 ('TfidfVectorizer', (1, 1)),
 ('TfidfVectorizer', (1, 2))]

df

df = pd.DataFrame(tiny_table)
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df.reset_index(inplace = True)
plt.figure()
sns.barplot(x="vectorizer",
           y="mnb",
           data=df)
plt.ylim(0.55,0.63)
plt.show()

df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df
# df.reset_index(inplace = True)

# df_m = pd.melt(df, id_vars="vectorizer", var_name="sentiment", value_name="score_value")
# df_m.reset_index(inplace=True)

# df_m
# df_m
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df_cv_i = df_cv.reset_index()
df_cv_i
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
df_m
# df_cv.reset_index(inplace=True)
# df.reset_index(inplace = True)
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('CountVectorizer')
plt.show()

df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df_cv_i = df_cv.reset_index()
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('CountVectorizer')
plt.show()

df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_cv = df[df['vectorizer'] == 'TfidfVectorizer']
df_cv_i = df_cv.reset_index()
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('TfidfVectorizer')
plt.show()

df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_m = pd.melt(df, id_vars="vectorizer", var_name="v", value_name="m")
df_cv = df_m[df_m['v'] == 'mnb']
df_cv
new_i = [0,1,2,0,1,2]
df_cv['new_i'] = new_i
plt.figure()
sns.barplot(x="new_i",
           y="m",
            hue="vectorizer",
           data=df_cv)
plt.ylim(0.55,0.65)
plt.title('MNB')
plt.show()
# df_cv

/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys

df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_m = pd.melt(df, id_vars="vectorizer", var_name="v", value_name="m")
df_cv = df_m[df_m['v'] == 'svm']
new_i = [0,1,2,0,1,2]
df_cv['new_i'] = new_i
plt.figure()
sns.barplot(x="new_i",
           y="m",
            hue="vectorizer",
           data=df_cv)
plt.ylim(0.55,0.65)
plt.title('SVM')
plt.show()

/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

df = pd.DataFrame(tiny_table)

df

# Your code starts here
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==0 and y_pred[i]==4):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)

	precision	recall	f1-score	support	model
0	0.457839	0.250085	0.323477	2931.000000	mnb
1	0.497771	0.381744	0.432105	10824.000000	mnb
2	0.672674	0.808310	0.734281	31864.000000	mnb
3	0.512089	0.478114	0.494519	13068.000000	mnb
4	0.484744	0.263580	0.341480	3737.000000	mnb
accuracy	0.606401	0.606401	0.606401	0.606401	mnb
macro avg	0.525023	0.436367	0.465172	62424.000000	mnb
weighted avg	0.587392	0.606401	0.588889	62424.000000	mnb

	vectorizer	bigrams	mnb	svm
0	CountVectorizer	(1, 1)	0.606401	0.624183
1	CountVectorizer	(1, 1)	0.606401	0.623686
2	CountVectorizer	(1, 2)	0.597382	0.630094
3	TfidfVectorizer	(1, 1)	0.583606	0.625433
4	TfidfVectorizer	(1, 1)	0.583606	0.625433
5	TfidfVectorizer	(1, 2)	0.594899	0.630126