In [ ]:
 
In [7]:
## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string

from tabulate import tabulate
import numpy as np

# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

vectorizers = [
    CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
    TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]


def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def make_pretty_cm(y_test, y_pred, labels, target_names):
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    
    plt.figure(figsize=(18,9))    
    target_names = target_names
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.subplot(1, 2, 1)  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    
    cm_n = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_df = pd.DataFrame(cm_n,columns,index)                      
    plt.subplot(1, 2, 2)  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show()

import seaborn as sns
import matplotlib.pyplot as plt
# m_df = mnb_report.copy()
# s_df = svm_report.copy()

def barplot_all(report):
    report.drop('support', inplace=True, axis=1)
    report = report.T.reset_index()
#     report = report.T
#     report.reset_index(inplace=True)
#     df5 = df4[df4.columns[0:3]]
    df4 = pd.melt(report, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df4);
    plt.show()
    
def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))
    plt.show()

def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
    plt.title('Comparing MNB & SVM')
    plt.show()

def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    print('----'*10)
    print('##MNB')
    print('----'*10)
    
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    y_pred = mnb_clf.predict(X_test_vec)
    
    make_pretty_cm(y_test, y_pred, labels, target_names)

    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)

    df = pd.DataFrame(report)
    print('======'*10)
    print('BY SENTIMENT')
    print('======'*10)
    print(tabulate(df[df.columns[0:4]], tablefmt="fancy_grid", headers=df.columns, floatfmt=".2f"))
    print('======'*10)
    print('BY PERFORMANCE')
    print('======'*10)
    print(tabulate(df[df.columns[5:8]], tablefmt="fancy_grid", headers=df.columns[5:8], floatfmt=".2f"))
    
    barplot_single_report(pd.DataFrame(report).T)
    barplot_all(pd.DataFrame(report).T)
    return pd.DataFrame(report).T, mnb_clf
    
    
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    print('----'*10)
    print('##SVM')
    print('----'*10)
    
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)
    y_pred = svm_clf.predict(X_test_vec)

    target_names = target_names
    make_pretty_cm(y_test, y_pred, labels, target_names)
    
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)

    df = pd.DataFrame(report)
    print('======'*10)
    print('BY SENTIMENT')
    print('======'*10)
    print(tabulate(df[df.columns[0:4]], tablefmt="fancy_grid", headers=df.columns, floatfmt=".2f"))
    print('======'*10)
    print('BY PERFORMANCE')
    print('======'*10)
    print(tabulate(df[df.columns[5:8]], tablefmt="fancy_grid", headers=df.columns[5:8], floatfmt=".2f"))
    
    barplot_single_report(pd.DataFrame(report).T)
    barplot_all(pd.DataFrame(report).T)
    return pd.DataFrame(report).T, svm_clf

    
def get_features(vec, thingy):
    feature_ranks = sorted(zip(thingy.coef_[0], vec.get_feature_names()))

    ## get the 10 features that are best indicators of very negative sentiment (they are at the bottom of the ranked list)
    very_negative_10 = feature_ranks[-10:]
    print("Very negative words")
    for i in range(0, len(very_negative_10)):
        print(very_negative_10[i])
    print()

    ## get 10 features that are least relevant to "very negative" sentiment (they are at the top of the ranked list)
    not_very_negative_10 = feature_ranks[:10]
    print("not very negative words")
    for i in range(0, len(not_very_negative_10)):
        print(not_very_negative_10[i])
    print()
    
def do_the_thing(X,y,labels, target_names):
    all_reports = []
    for i,vec in enumerate(vectorizers):
        

        
        params = vec.get_params()
        df = pd.DataFrame([params]).T
        vec_type = str(vec).split('(')[0]
        title = str(i)+ '_' + vec_type
        print(title)
        print(tabulate(df, tablefmt="fancy_grid", headers=df.columns, floatfmt=".2f"))

        X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
        mnb_report, mnb_clf = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
#         svm_report, svm_clf = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
#         barplot_both_report(mnb_report, svm_report)
        
        print('MNB FEATURES******************************')
        get_features(vec, mnb_clf)
#         print('SVM FEATURES******************************')
#         get_features(vec, svm_clf)
#         all_reports.append({ i : {'mnb': mnb_report, 'svm': svm_report} })
    return all_reports
In [8]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
all_reports = do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
0_CountVectorizer
╒═══════════════╤═══════════════════════╕
│               │ 0                     │
╞═══════════════╪═══════════════════════╡
│ analyzer      │ word                  │
├───────────────┼───────────────────────┤
│ binary        │ True                  │
├───────────────┼───────────────────────┤
│ decode_error  │ strict                │
├───────────────┼───────────────────────┤
│ dtype         │ <class 'numpy.int64'> │
├───────────────┼───────────────────────┤
│ encoding      │ latin-1               │
├───────────────┼───────────────────────┤
│ input         │ content               │
├───────────────┼───────────────────────┤
│ lowercase     │ True                  │
├───────────────┼───────────────────────┤
│ max_df        │ 1.0                   │
├───────────────┼───────────────────────┤
│ max_features  │                       │
├───────────────┼───────────────────────┤
│ min_df        │ 5                     │
├───────────────┼───────────────────────┤
│ ngram_range   │ (1, 1)                │
├───────────────┼───────────────────────┤
│ preprocessor  │                       │
├───────────────┼───────────────────────┤
│ stop_words    │ english               │
├───────────────┼───────────────────────┤
│ strip_accents │                       │
├───────────────┼───────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b         │
├───────────────┼───────────────────────┤
│ tokenizer     │                       │
├───────────────┼───────────────────────┤
│ vocabulary    │                       │
╘═══════════════╧═══════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.46 │     0.50 │     0.67 │     0.51 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.25 │     0.38 │     0.81 │     0.48 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.32 │     0.43 │     0.73 │     0.49 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.61 │        0.53 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.61 │        0.44 │           0.61 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.61 │        0.47 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.61 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-5.946405317563997, 'time')
(-5.935710028447249, 'minutes')
(-5.925127919116712, 'characters')
(-5.925127919116712, 'story')
(-5.90429383221387, 'comedy')
(-5.6998813176057235, 'just')
(-5.195801081979265, 'like')
(-5.071833039257134, 'bad')
(-4.847793028895888, 'film')
(-4.322025825131698, 'movie')

not very negative words
(-10.479004810717253, '102')
(-10.479004810717253, '10th')
(-10.479004810717253, '127')
(-10.479004810717253, '13th')
(-10.479004810717253, '14')
(-10.479004810717253, '16')
(-10.479004810717253, '163')
(-10.479004810717253, '168')
(-10.479004810717253, '170')
(-10.479004810717253, '1790')

1_CountVectorizer
╒═══════════════╤═══════════════════════╕
│               │ 0                     │
╞═══════════════╪═══════════════════════╡
│ analyzer      │ word                  │
├───────────────┼───────────────────────┤
│ binary        │ False                 │
├───────────────┼───────────────────────┤
│ decode_error  │ strict                │
├───────────────┼───────────────────────┤
│ dtype         │ <class 'numpy.int64'> │
├───────────────┼───────────────────────┤
│ encoding      │ latin-1               │
├───────────────┼───────────────────────┤
│ input         │ content               │
├───────────────┼───────────────────────┤
│ lowercase     │ True                  │
├───────────────┼───────────────────────┤
│ max_df        │ 1.0                   │
├───────────────┼───────────────────────┤
│ max_features  │                       │
├───────────────┼───────────────────────┤
│ min_df        │ 5                     │
├───────────────┼───────────────────────┤
│ ngram_range   │ (1, 1)                │
├───────────────┼───────────────────────┤
│ preprocessor  │                       │
├───────────────┼───────────────────────┤
│ stop_words    │ english               │
├───────────────┼───────────────────────┤
│ strip_accents │                       │
├───────────────┼───────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b         │
├───────────────┼───────────────────────┤
│ tokenizer     │                       │
├───────────────┼───────────────────────┤
│ vocabulary    │                       │
╘═══════════════╧═══════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.46 │     0.50 │     0.67 │     0.51 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.25 │     0.38 │     0.81 │     0.48 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.33 │     0.43 │     0.73 │     0.49 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.61 │        0.52 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.61 │        0.44 │           0.61 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.61 │        0.47 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.61 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-5.941598005980322, 'time')
(-5.931015896649785, 'characters')
(-5.92054459678249, 'minutes')
(-5.92054459678249, 'story')
(-5.910181809746943, 'comedy')
(-5.689102242653584, 'just')
(-5.137785257532857, 'like')
(-4.975504451622348, 'bad')
(-4.832403607981675, 'film')
(-4.3215779842156845, 'movie')

not very negative words
(-10.484892788250326, '102')
(-10.484892788250326, '10th')
(-10.484892788250326, '127')
(-10.484892788250326, '13th')
(-10.484892788250326, '14')
(-10.484892788250326, '16')
(-10.484892788250326, '163')
(-10.484892788250326, '168')
(-10.484892788250326, '170')
(-10.484892788250326, '1790')

2_CountVectorizer
╒═══════════════╤═══════════════════════╕
│               │ 0                     │
╞═══════════════╪═══════════════════════╡
│ analyzer      │ word                  │
├───────────────┼───────────────────────┤
│ binary        │ False                 │
├───────────────┼───────────────────────┤
│ decode_error  │ strict                │
├───────────────┼───────────────────────┤
│ dtype         │ <class 'numpy.int64'> │
├───────────────┼───────────────────────┤
│ encoding      │ latin-1               │
├───────────────┼───────────────────────┤
│ input         │ content               │
├───────────────┼───────────────────────┤
│ lowercase     │ True                  │
├───────────────┼───────────────────────┤
│ max_df        │ 1.0                   │
├───────────────┼───────────────────────┤
│ max_features  │                       │
├───────────────┼───────────────────────┤
│ min_df        │ 5                     │
├───────────────┼───────────────────────┤
│ ngram_range   │ (1, 2)                │
├───────────────┼───────────────────────┤
│ preprocessor  │                       │
├───────────────┼───────────────────────┤
│ stop_words    │ english               │
├───────────────┼───────────────────────┤
│ strip_accents │                       │
├───────────────┼───────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b         │
├───────────────┼───────────────────────┤
│ tokenizer     │                       │
├───────────────┼───────────────────────┤
│ vocabulary    │                       │
╘═══════════════╧═══════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.40 │     0.48 │     0.68 │     0.51 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.30 │     0.41 │     0.77 │     0.49 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.34 │     0.44 │     0.72 │     0.50 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.60 │        0.50 │           0.58 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.60 │        0.45 │           0.60 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.60 │        0.47 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.60 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-6.629714522576447, 'time')
(-6.61913241324591, 'characters')
(-6.608661113378615, 'minutes')
(-6.608661113378615, 'story')
(-6.598298326343068, 'comedy')
(-6.377218759249709, 'just')
(-5.825901774128982, 'like')
(-5.663620968218473, 'bad')
(-5.5205201245778, 'film')
(-5.0096945008118094, 'movie')

not very negative words
(-11.17300930484645, '10 course')
(-11.17300930484645, '10 year')
(-11.17300930484645, '100 minute')
(-11.17300930484645, '100 years')
(-11.17300930484645, '101 minutes')
(-11.17300930484645, '101 premise')
(-11.17300930484645, '102')
(-11.17300930484645, '102 minute')
(-11.17300930484645, '10th')
(-11.17300930484645, '10th grade')

3_TfidfVectorizer
╒═══════════════╤═════════════════════════╕
│               │ 0                       │
╞═══════════════╪═════════════════════════╡
│ analyzer      │ word                    │
├───────────────┼─────────────────────────┤
│ binary        │ False                   │
├───────────────┼─────────────────────────┤
│ decode_error  │ strict                  │
├───────────────┼─────────────────────────┤
│ dtype         │ <class 'numpy.float64'> │
├───────────────┼─────────────────────────┤
│ encoding      │ latin-1                 │
├───────────────┼─────────────────────────┤
│ input         │ content                 │
├───────────────┼─────────────────────────┤
│ lowercase     │ True                    │
├───────────────┼─────────────────────────┤
│ max_df        │ 1.0                     │
├───────────────┼─────────────────────────┤
│ max_features  │                         │
├───────────────┼─────────────────────────┤
│ min_df        │ 5                       │
├───────────────┼─────────────────────────┤
│ ngram_range   │ (1, 1)                  │
├───────────────┼─────────────────────────┤
│ norm          │ l2                      │
├───────────────┼─────────────────────────┤
│ preprocessor  │                         │
├───────────────┼─────────────────────────┤
│ smooth_idf    │ True                    │
├───────────────┼─────────────────────────┤
│ stop_words    │ english                 │
├───────────────┼─────────────────────────┤
│ strip_accents │                         │
├───────────────┼─────────────────────────┤
│ sublinear_tf  │ False                   │
├───────────────┼─────────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b           │
├───────────────┼─────────────────────────┤
│ tokenizer     │                         │
├───────────────┼─────────────────────────┤
│ use_idf       │ True                    │
├───────────────┼─────────────────────────┤
│ vocabulary    │                         │
╘═══════════════╧═════════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.57 │     0.51 │     0.60 │     0.52 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.04 │     0.24 │     0.90 │     0.37 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.07 │     0.33 │     0.72 │     0.43 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.58 │        0.57 │           0.57 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.58 │        0.32 │           0.58 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.58 │        0.33 │           0.53 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.58 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-6.645979011120781, 'time')
(-6.62020923362483, 'does')
(-6.60369986228579, 'minutes')
(-6.517311233308192, 'dull')
(-6.355845020937053, 'just')
(-6.131355008437069, 'worst')
(-6.029810232070883, 'like')
(-5.792621365210278, 'film')
(-5.411480448479903, 'bad')
(-5.186697744581525, 'movie')

not very negative words
(-9.958278961750986, '102')
(-9.958278961750986, '10th')
(-9.958278961750986, '127')
(-9.958278961750986, '13th')
(-9.958278961750986, '14')
(-9.958278961750986, '16')
(-9.958278961750986, '163')
(-9.958278961750986, '168')
(-9.958278961750986, '170')
(-9.958278961750986, '1790')

4_TfidfVectorizer
╒═══════════════╤═════════════════════════╕
│               │ 0                       │
╞═══════════════╪═════════════════════════╡
│ analyzer      │ word                    │
├───────────────┼─────────────────────────┤
│ binary        │ False                   │
├───────────────┼─────────────────────────┤
│ decode_error  │ strict                  │
├───────────────┼─────────────────────────┤
│ dtype         │ <class 'numpy.float64'> │
├───────────────┼─────────────────────────┤
│ encoding      │ latin-1                 │
├───────────────┼─────────────────────────┤
│ input         │ content                 │
├───────────────┼─────────────────────────┤
│ lowercase     │ True                    │
├───────────────┼─────────────────────────┤
│ max_df        │ 0.5                     │
├───────────────┼─────────────────────────┤
│ max_features  │                         │
├───────────────┼─────────────────────────┤
│ min_df        │ 5                       │
├───────────────┼─────────────────────────┤
│ ngram_range   │ (1, 1)                  │
├───────────────┼─────────────────────────┤
│ norm          │ l2                      │
├───────────────┼─────────────────────────┤
│ preprocessor  │                         │
├───────────────┼─────────────────────────┤
│ smooth_idf    │ True                    │
├───────────────┼─────────────────────────┤
│ stop_words    │ english                 │
├───────────────┼─────────────────────────┤
│ strip_accents │                         │
├───────────────┼─────────────────────────┤
│ sublinear_tf  │ False                   │
├───────────────┼─────────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b           │
├───────────────┼─────────────────────────┤
│ tokenizer     │                         │
├───────────────┼─────────────────────────┤
│ use_idf       │ True                    │
├───────────────┼─────────────────────────┤
│ vocabulary    │                         │
╘═══════════════╧═════════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.57 │     0.51 │     0.60 │     0.52 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.04 │     0.24 │     0.90 │     0.37 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.07 │     0.33 │     0.72 │     0.43 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.58 │        0.57 │           0.57 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.58 │        0.32 │           0.58 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.58 │        0.33 │           0.53 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.58 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-6.645979011120781, 'time')
(-6.62020923362483, 'does')
(-6.60369986228579, 'minutes')
(-6.517311233308192, 'dull')
(-6.355845020937053, 'just')
(-6.131355008437069, 'worst')
(-6.029810232070883, 'like')
(-5.792621365210278, 'film')
(-5.411480448479903, 'bad')
(-5.186697744581525, 'movie')

not very negative words
(-9.958278961750986, '102')
(-9.958278961750986, '10th')
(-9.958278961750986, '127')
(-9.958278961750986, '13th')
(-9.958278961750986, '14')
(-9.958278961750986, '16')
(-9.958278961750986, '163')
(-9.958278961750986, '168')
(-9.958278961750986, '170')
(-9.958278961750986, '1790')

5_TfidfVectorizer
╒═══════════════╤═════════════════════════╕
│               │ 0                       │
╞═══════════════╪═════════════════════════╡
│ analyzer      │ word                    │
├───────────────┼─────────────────────────┤
│ binary        │ False                   │
├───────────────┼─────────────────────────┤
│ decode_error  │ strict                  │
├───────────────┼─────────────────────────┤
│ dtype         │ <class 'numpy.float64'> │
├───────────────┼─────────────────────────┤
│ encoding      │ latin-1                 │
├───────────────┼─────────────────────────┤
│ input         │ content                 │
├───────────────┼─────────────────────────┤
│ lowercase     │ True                    │
├───────────────┼─────────────────────────┤
│ max_df        │ 1.0                     │
├───────────────┼─────────────────────────┤
│ max_features  │                         │
├───────────────┼─────────────────────────┤
│ min_df        │ 5                       │
├───────────────┼─────────────────────────┤
│ ngram_range   │ (1, 2)                  │
├───────────────┼─────────────────────────┤
│ norm          │ l2                      │
├───────────────┼─────────────────────────┤
│ preprocessor  │                         │
├───────────────┼─────────────────────────┤
│ smooth_idf    │ True                    │
├───────────────┼─────────────────────────┤
│ stop_words    │ english                 │
├───────────────┼─────────────────────────┤
│ strip_accents │                         │
├───────────────┼─────────────────────────┤
│ sublinear_tf  │ False                   │
├───────────────┼─────────────────────────┤
│ token_pattern │ (?u)\b\w\w+\b           │
├───────────────┼─────────────────────────┤
│ tokenizer     │                         │
├───────────────┼─────────────────────────┤
│ use_idf       │ True                    │
├───────────────┼─────────────────────────┤
│ vocabulary    │                         │
╘═══════════════╧═════════════════════════╛
----------------------------------------
##MNB
----------------------------------------
============================================================
BY SENTIMENT
============================================================
╒═══════════╤═════════╤══════════╤══════════╤══════════╕
│ 0         │       1 │        2 │        3 │        4 │
╞═══════════╪═════════╪══════════╪══════════╪══════════╡
│ precision │    0.65 │     0.53 │     0.61 │     0.54 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ recall    │    0.06 │     0.26 │     0.90 │     0.39 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ f1-score  │    0.11 │     0.35 │     0.73 │     0.45 │
├───────────┼─────────┼──────────┼──────────┼──────────┤
│ support   │ 2931.00 │ 10824.00 │ 31864.00 │ 13068.00 │
╘═══════════╧═════════╧══════════╧══════════╧══════════╛
============================================================
BY PERFORMANCE
============================================================
╒═══════════╤════════════╤═════════════╤════════════════╕
│           │   accuracy │   macro avg │   weighted avg │
╞═══════════╪════════════╪═════════════╪════════════════╡
│ precision │       0.59 │        0.60 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ recall    │       0.59 │        0.34 │           0.59 │
├───────────┼────────────┼─────────────┼────────────────┤
│ f1-score  │       0.59 │        0.36 │           0.54 │
├───────────┼────────────┼─────────────┼────────────────┤
│ support   │       0.59 │    62424.00 │       62424.00 │
╘═══════════╧════════════╧═════════════╧════════════════╛
MNB FEATURES******************************
Very negative words
(-7.670900924532225, 'story')
(-7.654743793752857, 'stupid')
(-7.624189077492526, 'mess')
(-7.498580938421712, 'dull')
(-7.393303232254995, 'just')
(-7.209695440732919, 'worst')
(-7.108870852486083, 'like')
(-6.852838351434956, 'film')
(-6.464253994333886, 'bad')
(-6.241813501470778, 'movie')

not very negative words
(-10.730617771376618, '10 course')
(-10.730617771376618, '10 year')
(-10.730617771376618, '100 minute')
(-10.730617771376618, '100 years')
(-10.730617771376618, '101 minutes')
(-10.730617771376618, '101 premise')
(-10.730617771376618, '102')
(-10.730617771376618, '102 minute')
(-10.730617771376618, '10th')
(-10.730617771376618, '10th grade')

In [327]:
all_reports[0][0]['mnb']
Out[327]:
precision recall f1-score support model
0 0.457839 0.250085 0.323477 2931.000000 mnb
1 0.497771 0.381744 0.432105 10824.000000 mnb
2 0.672674 0.808310 0.734281 31864.000000 mnb
3 0.512089 0.478114 0.494519 13068.000000 mnb
4 0.484744 0.263580 0.341480 3737.000000 mnb
accuracy 0.606401 0.606401 0.606401 0.606401 mnb
macro avg 0.525023 0.436367 0.465172 62424.000000 mnb
weighted avg 0.587392 0.606401 0.588889 62424.000000 mnb
In [349]:
vec_strings = [(str(v).split('(')[0], v.ngram_range) for v in vectorizers]

tiny_table = []
for i,vec in enumerate(all_reports):
    tiny_table.append(
        {'vectorizer': vec_strings[i][0],
         'bigrams': vec_strings[i][1],
         'mnb': vec[i]['mnb'].T['accuracy'][0],
         'svm':vec[i]['svm'].T['accuracy'][0]
        })
    print(vec_strings[i])
    print(vec[i]['mnb'].T['accuracy'][0])
    print(vec[i]['svm'].T['accuracy'][0])
('CountVectorizer', (1, 1))
0.606401384083045
0.6241830065359477
('CountVectorizer', (1, 1))
0.606401384083045
0.6236864026656415
('CountVectorizer', (1, 2))
0.5973824170190952
0.6300941945405614
('TfidfVectorizer', (1, 1))
0.5836056644880174
0.6254325259515571
('TfidfVectorizer', (1, 1))
0.5836056644880174
0.6254325259515571
('TfidfVectorizer', (1, 2))
0.5948993976675637
0.6301262334999359
In [350]:
vec_strings = [(str(v).split('(')[0], v.ngram_range) for v in vectorizers]
In [351]:
vec_strings
Out[351]:
[('CountVectorizer', (1, 1)),
 ('CountVectorizer', (1, 1)),
 ('CountVectorizer', (1, 2)),
 ('TfidfVectorizer', (1, 1)),
 ('TfidfVectorizer', (1, 1)),
 ('TfidfVectorizer', (1, 2))]
In [352]:
 
In [353]:
df
Out[353]:
vectorizer bigrams mnb svm
0 CountVectorizer (1, 1) 0.606401 0.624183
1 CountVectorizer (1, 1) 0.606401 0.623686
2 CountVectorizer (1, 2) 0.597382 0.630094
3 TfidfVectorizer (1, 1) 0.583606 0.625433
4 TfidfVectorizer (1, 1) 0.583606 0.625433
5 TfidfVectorizer (1, 2) 0.594899 0.630126
In [363]:
df = pd.DataFrame(tiny_table)
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df.reset_index(inplace = True)
plt.figure()
sns.barplot(x="vectorizer",
           y="mnb",
           data=df)
plt.ylim(0.55,0.63)
plt.show()
In [395]:
df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df
# df.reset_index(inplace = True)

# df_m = pd.melt(df, id_vars="vectorizer", var_name="sentiment", value_name="score_value")
# df_m.reset_index(inplace=True)

# df_m
# df_m
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df_cv_i = df_cv.reset_index()
df_cv_i
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
df_m
# df_cv.reset_index(inplace=True)
# df.reset_index(inplace = True)
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('CountVectorizer')
plt.show()
In [396]:
df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_cv = df[df['vectorizer'] == 'CountVectorizer']
df_cv_i = df_cv.reset_index()
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('CountVectorizer')
plt.show()
In [399]:
df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_cv = df[df['vectorizer'] == 'TfidfVectorizer']
df_cv_i = df_cv.reset_index()
df_m = pd.melt(df_cv_i[['index','mnb','svm']], id_vars="index", var_name="vec", value_name="model")
plt.figure()
sns.barplot(x="index",
           y="model",
            hue="vec",
           data=df_m)
plt.ylim(0.55,0.65)
plt.title('TfidfVectorizer')
plt.show()
In [422]:
df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_m = pd.melt(df, id_vars="vectorizer", var_name="v", value_name="m")
df_cv = df_m[df_m['v'] == 'mnb']
df_cv
new_i = [0,1,2,0,1,2]
df_cv['new_i'] = new_i
plt.figure()
sns.barplot(x="new_i",
           y="m",
            hue="vectorizer",
           data=df_cv)
plt.ylim(0.55,0.65)
plt.title('MNB')
plt.show()
# df_cv
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
In [423]:
df = pd.DataFrame(tiny_table)
df = df[['vectorizer','mnb','svm']]
df_m = pd.melt(df, id_vars="vectorizer", var_name="v", value_name="m")
df_cv = df_m[df_m['v'] == 'svm']
new_i = [0,1,2,0,1,2]
df_cv['new_i'] = new_i
plt.figure()
sns.barplot(x="new_i",
           y="m",
            hue="vectorizer",
           data=df_cv)
plt.ylim(0.55,0.65)
plt.title('SVM')
plt.show()
/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [417]:
df = pd.DataFrame(tiny_table)
In [418]:
df
Out[418]:
vectorizer bigrams mnb svm
0 CountVectorizer (1, 1) 0.606401 0.624183
1 CountVectorizer (1, 1) 0.606401 0.623686
2 CountVectorizer (1, 2) 0.597382 0.630094
3 TfidfVectorizer (1, 1) 0.583606 0.625433
4 TfidfVectorizer (1, 1) 0.583606 0.625433
5 TfidfVectorizer (1, 2) 0.594899 0.630126
In [ ]:
# Your code starts here
err_cnt = 0
for i in range(0, len(y_test)):
    if(y_test[i]==0 and y_pred[i]==4):
        print(X_test[i])
        err_cnt = err_cnt+1
print("errors:", err_cnt)