HW7: Comparing MNB & SVM with Kaggle Sentiment Data¶

OVERVIEW¶

VECTORIZERS USED:¶

CountVectorizer
TfidfVectorizer

MODELS USED:¶

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)

VECTORIZATION PARAMS:¶

Binary
Stopwords
Unigrams, Bigrams
Min & Max df

TODO:¶

Stemming?
VADER + TextBlob

FUNCTION & PACKAGE PARTY¶

## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZERS
# Options common to every vectorizer below: latin-1 decoding, ignore terms that
# appear in fewer than 5 documents, and use the built-in English stop-word list.
_shared_opts = dict(encoding='latin-1', min_df=5, stop_words='english')
# Token pattern for the stricter variants: alphabetic tokens of 2+ letters only.
_alpha_only = r'(?u)\b[a-zA-Z]{2,}\b'

# Boolean (presence/absence) unigram features.
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_shared_opts)
unigram_bool_cv_v2 = CountVectorizer(binary=True, token_pattern=_alpha_only, **_shared_opts)

# Unigram term counts.
unigram_cv = CountVectorizer(binary=False, token_pattern=_alpha_only, **_shared_opts)

# Unigram + bigram term counts.
bigram_cv = CountVectorizer(ngram_range=(1,2), **_shared_opts)
bigram_cv_v2 = CountVectorizer(ngram_range=(1,2), token_pattern=_alpha_only, **_shared_opts)

# TF-IDF weighted unigrams.
unigram_tv = TfidfVectorizer(use_idf=True, **_shared_opts)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, token_pattern=_alpha_only, **_shared_opts)

# TF-IDF weighted unigrams + bigrams.
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=(1,2), **_shared_opts)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, ngram_range=(1,2), token_pattern=_alpha_only,
                               **_shared_opts)

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
# Module-level estimator instances. run_classifier() fits whichever instance it
# is handed in place, so every experiment re-fits these same two objects.
mnb = MultinomialNB()
# Linear SVM with regularization parameter C=1.
svm = LinearSVC(C=1)

def get_test_train_vec(X, y, vectorizer):
    """Split documents and labels 60/40, then vectorize both partitions.

    The vectorizer is fit on the training documents only and then applied to
    the held-out documents. The split uses a fixed random_state of 0.

    Returns:
        (X_train_vec, X_test_vec, y_train, y_test)
    """
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        X, y, random_state=0, test_size=0.4
    )
    train_matrix = vectorizer.fit_transform(docs_train)
    test_matrix = vectorizer.transform(docs_test)
    return train_matrix, test_matrix, labels_train, labels_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit ``classifier`` on the training matrix and evaluate it on the test matrix.

    Args:
        X_train_vec, X_test_vec: vectorized training / test documents.
        y_train, y_test: matching label sequences.
        labels: class labels to report on, in the same order as ``target_names``.
            ``None`` lets scikit-learn infer them from the data.
        target_names: display names for ``labels`` in the report.
        classifier: estimator instance; it is fit in place (not cloned).

    Returns:
        (fitted classifier, test-set score from ``classifier.score``,
        classification report as a dict)
    """
    clf = classifier
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    # Pass `labels` explicitly: without it the report pairs target_names with the
    # *sorted* labels found in the data, which mislabels rows whenever the caller's
    # label list is in a different order (e.g. built from a set).
    report = classification_report(
        y_test, y_pred, labels=labels, target_names=target_names, output_dict=True
    )
    score = clf.score(X_test_vec, y_test)
    return clf, score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    """Vectorize a train/test split of (X, y) with ``vec`` and evaluate ``classifier`` on it.

    Returns:
        (model, score, report) exactly as produced by run_classifier().
    """
    split = get_test_train_vec(X, y, vec)
    fitted, accuracy, summary = run_classifier(*split, labels, target_names, classifier)
    return fitted, accuracy, summary
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

DATA GOES HERE:¶

# import pandas as pd
# train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
# y=train['Sentiment'].values
# X=train['Phrase'].values


import pandas as pd
import numpy as np
# Load the dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../death_row_discritized.csv')

def to_string(tokens):
    """Join a stringified list of tokens into one space-separated string.

    Args:
        tokens: text of a Python list literal of strings, e.g. "['a', 'b']".

    Returns:
        The tokens joined by single spaces, or the sentinel "error" when the
        value is not a parseable literal (NaN, free text, ...) or does not
        contain strings.
    """
    import ast  # local import so this cell does not depend on another import cell

    try:
        # literal_eval only accepts Python literals, so arbitrary expressions in
        # the CSV are rejected instead of executed (unlike eval).
        return " ".join(ast.literal_eval(tokens))
    except (ValueError, SyntaxError, TypeError):
        return "error"
    
column_name = 'time_spent'
# `last_statement` is parsed by to_string() as a stringified token list and
# flattened to plain text; only that one column is needed, so apply on the Series.
df['statement_string'] = df['last_statement'].apply(to_string)
# y=df['vic_kid'].values
raw_labels = df[column_name].values

# Non-string labels (presumably NaN from blank cells -- confirm against the CSV)
# are replaced with the first row's label.
fallback_label = raw_labels[0]
y = [value if isinstance(value, str) else fallback_label for value in raw_labels]
# Sorted so the label order is reproducible between runs; a bare set() is hash-ordered.
# key=str keeps this from raising if a non-string label slipped through.
y_labels = sorted(set(y), key=str)
X=df['statement_string'].values
y_labels


def return_features(vec, model):
    """Print the 10 lowest- and 10 highest-weighted features for each weight row of a model.

    Args:
        vec: fitted vectorizer whose vocabulary defines the model's feature columns.
        model: fitted classifier exposing ``coef_`` (or ``feature_log_prob_`` for
            naive Bayes on scikit-learn versions that no longer provide ``coef_``).

    One table is printed per weight row. Features are sorted by weight ascending,
    so the left column pair holds the lowest weights ("Least Likely") and the
    right pair the highest ("Most Likely").
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its replacement.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = list(vec.get_feature_names_out())
    else:
        feature_names = vec.get_feature_names()

    # MultinomialNB dropped coef_ in scikit-learn 1.1; its per-class feature
    # log-probabilities carry the same information.
    weights_per_class = getattr(model, 'coef_', None)
    if weights_per_class is None:
        weights_per_class = model.feature_log_prob_

    # Label rows from the model's own class order rather than the notebook's label list.
    classes = list(getattr(model, 'classes_', y_labels))
    # A binary model with a single weight row describes the second (positive) class.
    if len(weights_per_class) == 1 and len(classes) == 2:
        classes = classes[1:]

    for label, weights in zip(classes, weights_per_class):
        print('============', column_name,': ', label)
        ranked = sorted(zip(weights, feature_names))  # sort once, slice both ends
        lowest = pd.DataFrame(ranked[:10])
        highest = pd.DataFrame(ranked[-10:])
        table = pd.concat([lowest, highest], axis=1)
        print(tabulate(table, tablefmt="fancy_grid", headers=["Least","Likely","Most","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    """Record one result row and return the de-duplicated results table.

    ``big_df`` (a list of row dicts) is appended to in place; the returned
    DataFrame is rebuilt from the whole list with exact duplicate rows dropped,
    so re-running a cell does not show the same result twice.
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()

TASK 1¶

TEST 1 -- MNB & SVM with Vectorizer 1¶

# Accumulator for result rows; update_big_df() appends to this list in place.
big_df = []

# V1: boolean unigram counts with the default token pattern, scored with MNB.
vec = unigram_bool_cv_v1
classifier = mnb

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

# NOTE: this rebinds `df`, which until now held the dataset loaded from the CSV.
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤════════════╤═════════╤══════════════════════╕
│    │   Most │ Likely     │   Least │ Likely               │
╞════╪════════╪════════════╪═════════╪══════════════════════╡
│  0 │  -7.99 │ especially │   -4.35 │ thank                │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  1 │  -7.29 │ brought    │   -4.32 │ im                   │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  2 │  -7.29 │ doesnt     │   -4.32 │ sorry                │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  3 │  -7.29 │ doing      │   -4.25 │ god                  │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  4 │  -7.29 │ faith      │   -4.20 │ know                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  5 │  -7.29 │ given      │   -4.10 │ want                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  6 │  -7.29 │ grace      │   -3.93 │ family               │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  7 │  -7.29 │ human      │   -3.59 │ love                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  8 │  -7.29 │ jack       │   -3.26 │ pronoun              │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  9 │  -7.29 │ joe        │   -3.20 │ first_person_pronoun │
╘════╧════════╧════════════╧═════════╧══════════════════════╛

# V1 again (boolean unigram counts), this time scored with the linear SVM.
vec = unigram_bool_cv_v1
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤══════════╤═════════╤═══════════╕
│    │   Most │ Likely   │   Least │ Likely    │
╞════╪════════╪══════════╪═════════╪═══════════╡
│  0 │  -1.16 │ faith    │    0.80 │ today     │
├────┼────────┼──────────┼─────────┼───────────┤
│  1 │  -1.12 │ taking   │    0.85 │ change    │
├────┼────────┼──────────┼─────────┼───────────┤
│  2 │  -1.05 │ jack     │    0.86 │ someday   │
├────┼────────┼──────────┼─────────┼───────────┤
│  3 │  -0.91 │ brought  │    0.93 │ shown     │
├────┼────────┼──────────┼─────────┼───────────┤
│  4 │  -0.86 │ thanks   │    0.94 │ happening │
├────┼────────┼──────────┼─────────┼───────────┤
│  5 │  -0.85 │ brings   │    0.97 │ heaven    │
├────┼────────┼──────────┼─────────┼───────────┤
│  6 │  -0.82 │ guilty   │    1.00 │ send      │
├────┼────────┼──────────┼─────────┼───────────┤
│  7 │  -0.82 │ allah    │    1.04 │ david     │
├────┼────────┼──────────┼─────────┼───────────┤
│  8 │  -0.81 │ loves    │    1.06 │ strong    │
├────┼────────┼──────────┼─────────┼───────────┤
│  9 │  -0.80 │ john     │    1.20 │ committed │
╘════╧════════╧══════════╧═════════╧═══════════╛

NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.

TEST 2 -- MNB & SVM with Vectorizer 2¶

# V2: boolean unigram counts restricted to alphabetic tokens of 2+ letters, with MNB.
vec = unigram_bool_cv_v2
classifier = mnb


model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤════════════╤═════════╤══════════╕
│    │   Most │ Likely     │   Least │ Likely   │
╞════╪════════╪════════════╪═════════╪══════════╡
│  0 │  -7.95 │ especially │   -4.34 │ like     │
├────┼────────┼────────────┼─────────┼──────────┤
│  1 │  -7.25 │ brought    │   -4.31 │ thank    │
├────┼────────┼────────────┼─────────┼──────────┤
│  2 │  -7.25 │ doesnt     │   -4.28 │ im       │
├────┼────────┼────────────┼─────────┼──────────┤
│  3 │  -7.25 │ doing      │   -4.28 │ sorry    │
├────┼────────┼────────────┼─────────┼──────────┤
│  4 │  -7.25 │ faith      │   -4.21 │ god      │
├────┼────────┼────────────┼─────────┼──────────┤
│  5 │  -7.25 │ given      │   -4.16 │ know     │
├────┼────────┼────────────┼─────────┼──────────┤
│  6 │  -7.25 │ grace      │   -4.05 │ want     │
├────┼────────┼────────────┼─────────┼──────────┤
│  7 │  -7.25 │ human      │   -3.89 │ family   │
├────┼────────┼────────────┼─────────┼──────────┤
│  8 │  -7.25 │ jack       │   -3.55 │ love     │
├────┼────────┼────────────┼─────────┼──────────┤
│  9 │  -7.25 │ joe        │   -3.22 │ pronoun  │
╘════╧════════╧════════════╧═════════╧══════════╛

# V2 (boolean unigrams, alphabetic tokens only) with the linear SVM.
vec = unigram_bool_cv_v2
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤══════════╤═════════╤═══════════╕
│    │   Most │ Likely   │   Least │ Likely    │
╞════╪════════╪══════════╪═════════╪═══════════╡
│  0 │  -1.18 │ faith    │    0.83 │ loved     │
├────┼────────┼──────────┼─────────┼───────────┤
│  1 │  -1.13 │ taking   │    0.85 │ change    │
├────┼────────┼──────────┼─────────┼───────────┤
│  2 │  -1.04 │ jack     │    0.88 │ someday   │
├────┼────────┼──────────┼─────────┼───────────┤
│  3 │  -0.95 │ brought  │    0.89 │ happening │
├────┼────────┼──────────┼─────────┼───────────┤
│  4 │  -0.90 │ guilty   │    0.92 │ shown     │
├────┼────────┼──────────┼─────────┼───────────┤
│  5 │  -0.86 │ brings   │    1.00 │ heaven    │
├────┼────────┼──────────┼─────────┼───────────┤
│  6 │  -0.85 │ allah    │    1.03 │ strong    │
├────┼────────┼──────────┼─────────┼───────────┤
│  7 │  -0.84 │ thanks   │    1.06 │ send      │
├────┼────────┼──────────┼─────────┼───────────┤
│  8 │  -0.80 │ john     │    1.06 │ david     │
├────┼────────┼──────────┼─────────┼───────────┤
│  9 │  -0.77 │ promise  │    1.19 │ committed │
╘════╧════════╧══════════╧═════════╧═══════════╛

TEST 3 -- MNB & SVM with Vectorizer 3¶

# V3: unigram term counts (binary=False), alphabetic tokens only, with MNB.
vec = unigram_cv
classifier = mnb


model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤════════════╤═════════╤══════════╕
│    │   Most │ Likely     │   Least │ Likely   │
╞════╪════════╪════════════╪═════════╪══════════╡
│  0 │  -8.43 │ especially │   -4.33 │ im       │
├────┼────────┼────────────┼─────────┼──────────┤
│  1 │  -7.73 │ doesnt     │   -4.25 │ yall     │
├────┼────────┼────────────┼─────────┼──────────┤
│  2 │  -7.73 │ given      │   -4.16 │ sorry    │
├────┼────────┼────────────┼─────────┼──────────┤
│  3 │  -7.73 │ grace      │   -4.16 │ thank    │
├────┼────────┼────────────┼─────────┼──────────┤
│  4 │  -7.73 │ jack       │   -4.11 │ god      │
├────┼────────┼────────────┼─────────┼──────────┤
│  5 │  -7.73 │ joe        │   -3.93 │ want     │
├────┼────────┼────────────┼─────────┼──────────┤
│  6 │  -7.73 │ john       │   -3.91 │ family   │
├────┼────────┼────────────┼─────────┼──────────┤
│  7 │  -7.73 │ lived      │   -3.73 │ know     │
├────┼────────┼────────────┼─────────┼──────────┤
│  8 │  -7.73 │ members    │   -3.13 │ love     │
├────┼────────┼────────────┼─────────┼──────────┤
│  9 │  -7.73 │ showed     │   -1.69 │ pronoun  │
╘════╧════════╧════════════╧═════════╧══════════╛

# V3 (unigram term counts) with the linear SVM.
vec = unigram_cv
classifier = svm


model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤══════════╤═════════╤═══════════╕
│    │   Most │ Likely   │   Least │ Likely    │
╞════╪════════╪══════════╪═════════╪═══════════╡
│  0 │  -1.16 │ taking   │    0.75 │ fear      │
├────┼────────┼──────────┼─────────┼───────────┤
│  1 │  -1.11 │ faith    │    0.76 │ strong    │
├────┼────────┼──────────┼─────────┼───────────┤
│  2 │  -1.08 │ jack     │    0.77 │ change    │
├────┼────────┼──────────┼─────────┼───────────┤
│  3 │  -0.94 │ brought  │    0.79 │ david     │
├────┼────────┼──────────┼─────────┼───────────┤
│  4 │  -0.87 │ amen     │    0.81 │ send      │
├────┼────────┼──────────┼─────────┼───────────┤
│  5 │  -0.86 │ words    │    0.82 │ happening │
├────┼────────┼──────────┼─────────┼───────────┤
│  6 │  -0.84 │ row      │    0.86 │ someday   │
├────┼────────┼──────────┼─────────┼───────────┤
│  7 │  -0.83 │ best     │    0.87 │ world     │
├────┼────────┼──────────┼─────────┼───────────┤
│  8 │  -0.81 │ blessing │    0.94 │ waiting   │
├────┼────────┼──────────┼─────────┼───────────┤
│  9 │  -0.77 │ guilty   │    1.27 │ committed │
╘════╧════════╧══════════╧═════════╧═══════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

TEST 4 -- MNB & SVM with Vectorizer 4¶

# V4: unigram + bigram counts with the default token pattern; MNB first.
vec = bigram_cv
classifier = mnb

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

# Same vectorizer (`vec` is reused and re-fit inside get_model), now with the linear SVM.
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
df

============ time_spent :  10_or_less
╒════╤════════╤═══════════════════════════════╤═════════╤═══════════════════════════════════════════╕
│    │   Most │ Likely                        │   Least │ Likely                                    │
╞════╪════════╪═══════════════════════════════╪═════════╪═══════════════════════════════════════════╡
│  0 │  -9.13 │ especially                    │   -4.63 │ first_person_pronoun first_person_pronoun │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  1 │  -9.13 │ peace god                     │   -4.63 │ want                                      │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  2 │  -8.44 │ come pronoun                  │   -4.62 │ family                                    │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  3 │  -8.44 │ daughter first_person_pronoun │   -4.44 │ know                                      │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  4 │  -8.44 │ doesnt                        │   -4.16 │ love pronoun                              │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  5 │  -8.44 │ first_person_pronoun come     │   -4.08 │ pronoun first_person_pronoun              │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  6 │  -8.44 │ first_person_pronoun daughter │   -4.00 │ first_person_pronoun love                 │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  7 │  -8.44 │ given                         │   -3.83 │ love                                      │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  8 │  -8.44 │ given first_person_pronoun    │   -2.40 │ pronoun                                   │
├────┼────────┼───────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  9 │  -8.44 │ god forgive                   │   -1.88 │ first_person_pronoun                      │
╘════╧════════╧═══════════════════════════════╧═════════╧═══════════════════════════════════════════╛
============ time_spent :  10_or_less
╒════╤════════╤══════════════════════════════╤═════════╤═════════════════════════════╕
│    │   Most │ Likely                       │   Least │ Likely                      │
╞════╪════════╪══════════════════════════════╪═════════╪═════════════════════════════╡
│  0 │  -0.95 │ say pronoun                  │    0.49 │ like say                    │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  1 │  -0.86 │ soon                         │    0.52 │ first_person_pronoun got    │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  2 │  -0.81 │ statement                    │    0.52 │ hold                        │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  3 │  -0.74 │ theres                       │    0.52 │ praise                      │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  4 │  -0.66 │ blessing                     │    0.53 │ mama                        │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  5 │  -0.56 │ kids                         │    0.55 │ first_person_pronoun im     │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  6 │  -0.55 │ brought                      │    0.57 │ first_person_pronoun want   │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  7 │  -0.54 │ words                        │    0.62 │ good                        │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  8 │  -0.53 │ brought first_person_pronoun │    0.66 │ yall im                     │
├────┼────────┼──────────────────────────────┼─────────┼─────────────────────────────┤
│  9 │  -0.47 │ first_person_pronoun heart   │    0.72 │ strong first_person_pronoun │
╘════╧════════╧══════════════════════════════╧═════════╧═════════════════════════════╛

df

TEST 5 -- MNB & SVM with Vectorizer 5¶

# V5: unigram + bigram counts, alphabetic tokens of 2+ letters only; MNB first.
vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})

# Same vectorizer, now with the linear SVM.
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})

============ time_spent :  10_or_less
╒════╤════════╤══════════════╤═════════╤═════════════════╕
│    │   Most │ Likely       │   Least │ Likely          │
╞════╪════════╪══════════════╪═════════╪═════════════════╡
│  0 │  -8.72 │ especially   │   -4.49 │ pronoun pronoun │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  1 │  -8.03 │ come pronoun │   -4.46 │ sorry           │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  2 │  -8.03 │ doesnt       │   -4.46 │ thank           │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  3 │  -8.03 │ given        │   -4.41 │ god             │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  4 │  -8.03 │ god forgive  │   -4.22 │ want            │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  5 │  -8.03 │ grace        │   -4.21 │ family          │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  6 │  -8.03 │ jack         │   -4.03 │ know            │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  7 │  -8.03 │ joe          │   -3.75 │ love pronoun    │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  8 │  -8.03 │ john         │   -3.43 │ love            │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  9 │  -8.03 │ lived        │   -1.99 │ pronoun         │
╘════╧════════╧══════════════╧═════════╧═════════════════╛
============ time_spent :  10_or_less
╒════╤════════╤════════════════════╤═════════╤═════════════════╕
│    │   Most │ Likely             │   Least │ Likely          │
╞════╪════════╪════════════════════╪═════════╪═════════════════╡
│  0 │  -1.12 │ say pronoun        │    0.59 │ apologize       │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  1 │  -0.84 │ brought            │    0.60 │ committed       │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  2 │  -0.78 │ soon               │    0.60 │ reason          │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  3 │  -0.76 │ know did           │    0.64 │ good            │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  4 │  -0.75 │ statement          │    0.64 │ pronoun support │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  5 │  -0.72 │ pronoun appreciate │    0.64 │ pronoun heart   │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  6 │  -0.63 │ theres             │    0.71 │ hold            │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  7 │  -0.61 │ taking             │    0.73 │ loved pronoun   │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  8 │  -0.59 │ blessing           │    0.77 │ yall im         │
├────┼────────┼────────────────────┼─────────┼─────────────────┤
│  9 │  -0.55 │ thanks             │    0.79 │ strong          │
╘════╧════════╧════════════════════╧═════════╧═════════════════╛

df

TEST 6 -- MNB & SVM with Vectorizer 6¶

# V6: TF-IDF weighted unigrams with the default token pattern; MNB first.
vec = unigram_tv
classifier = mnb


model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})

# Same vectorizer, now with the linear SVM.
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})

============ time_spent :  10_or_less
╒════╤════════╤════════════╤═════════╤══════════════════════╕
│    │   Most │ Likely     │   Least │ Likely               │
╞════╪════════╪════════════╪═════════╪══════════════════════╡
│  0 │  -6.65 │ especially │   -4.78 │ sorry                │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  1 │  -6.60 │ showed     │   -4.73 │ god                  │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  2 │  -6.59 │ touch      │   -4.72 │ know                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  3 │  -6.58 │ given      │   -4.67 │ yall                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  4 │  -6.58 │ taking     │   -4.66 │ want                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  5 │  -6.57 │ brought    │   -4.65 │ im                   │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  6 │  -6.57 │ john       │   -4.60 │ family               │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  7 │  -6.56 │ leave      │   -4.06 │ love                 │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  8 │  -6.55 │ night      │   -3.06 │ pronoun              │
├────┼────────┼────────────┼─────────┼──────────────────────┤
│  9 │  -6.54 │ thought    │   -2.61 │ first_person_pronoun │
╘════╧════════╧════════════╧═════════╧══════════════════════╛
============ time_spent :  10_or_less
╒════╤════════╤══════════╤═════════╤═══════════╕
│    │   Most │ Likely   │   Least │ Likely    │
╞════╪════════╪══════════╪═════════╪═══════════╡
│  0 │  -1.42 │ taking   │    0.98 │ fear      │
├────┼────────┼──────────┼─────────┼───────────┤
│  1 │  -1.38 │ row      │    0.98 │ today     │
├────┼────────┼──────────┼─────────┼───────────┤
│  2 │  -1.37 │ years    │    0.99 │ david     │
├────┼────────┼──────────┼─────────┼───────────┤
│  3 │  -1.26 │ father   │    1.00 │ everybody │
├────┼────────┼──────────┼─────────┼───────────┤
│  4 │  -1.19 │ ive      │    1.03 │ reason    │
├────┼────────┼──────────┼─────────┼───────────┤
│  5 │  -1.17 │ brought  │    1.04 │ committed │
├────┼────────┼──────────┼─────────┼───────────┤
│  6 │  -1.16 │ ah       │    1.05 │ happening │
├────┼────────┼──────────┼─────────┼───────────┤
│  7 │  -1.12 │ words    │    1.14 │ brother   │
├────┼────────┼──────────┼─────────┼───────────┤
│  8 │  -1.00 │ allah    │    1.20 │ praise    │
├────┼────────┼──────────┼─────────┼───────────┤
│  9 │  -0.99 │ jack     │    1.35 │ strong    │
╘════╧════════╧══════════╧═════════╧═══════════╛

df

TEST 7 -- MNB & SVM with Vectorizer 7¶

# V7: TF-IDF weighted unigrams, alphabetic tokens of 2+ letters only; MNB first.
vec = unigram_tv_v2
classifier = mnb

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})

# Same vectorizer, now with the linear SVM.
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})

============ time_spent :  10_or_less
╒════╤════════╤════════════╤═════════╤══════════╕
│    │   Most │ Likely     │   Least │ Likely   │
╞════╪════════╪════════════╪═════════╪══════════╡
│  0 │  -6.67 │ especially │   -4.65 │ thank    │
├────┼────────┼────────────┼─────────┼──────────┤
│  1 │  -6.59 │ showed     │   -4.62 │ god      │
├────┼────────┼────────────┼─────────┼──────────┤
│  2 │  -6.59 │ taking     │   -4.62 │ sorry    │
├────┼────────┼────────────┼─────────┼──────────┤
│  3 │  -6.58 │ given      │   -4.59 │ im       │
├────┼────────┼────────────┼─────────┼──────────┤
│  4 │  -6.58 │ john       │   -4.59 │ yall     │
├────┼────────┼────────────┼─────────┼──────────┤
│  5 │  -6.58 │ touch      │   -4.58 │ know     │
├────┼────────┼────────────┼─────────┼──────────┤
│  6 │  -6.56 │ brought    │   -4.53 │ want     │
├────┼────────┼────────────┼─────────┼──────────┤
│  7 │  -6.54 │ leave      │   -4.43 │ family   │
├────┼────────┼────────────┼─────────┼──────────┤
│  8 │  -6.54 │ night      │   -3.93 │ love     │
├────┼────────┼────────────┼─────────┼──────────┤
│  9 │  -6.54 │ soon       │   -2.92 │ pronoun  │
╘════╧════════╧════════════╧═════════╧══════════╛
============ time_spent :  10_or_less
╒════╤════════╤══════════╤═════════╤═══════════╕
│    │   Most │ Likely   │   Least │ Likely    │
╞════╪════════╪══════════╪═════════╪═══════════╡
│  0 │  -1.51 │ taking   │    0.98 │ send      │
├────┼────────┼──────────┼─────────┼───────────┤
│  1 │  -1.36 │ row      │    0.99 │ today     │
├────┼────────┼──────────┼─────────┼───────────┤
│  2 │  -1.28 │ years    │    1.00 │ everybody │
├────┼────────┼──────────┼─────────┼───────────┤
│  3 │  -1.25 │ father   │    1.00 │ fear      │
├────┼────────┼──────────┼─────────┼───────────┤
│  4 │  -1.24 │ ive      │    1.07 │ reason    │
├────┼────────┼──────────┼─────────┼───────────┤
│  5 │  -1.14 │ brought  │    1.08 │ committed │
├────┼────────┼──────────┼─────────┼───────────┤
│  6 │  -1.13 │ ah       │    1.11 │ happening │
├────┼────────┼──────────┼─────────┼───────────┤
│  7 │  -1.11 │ jack     │    1.13 │ praise    │
├────┼────────┼──────────┼─────────┼───────────┤
│  8 │  -1.09 │ words    │    1.18 │ brother   │
├────┼────────┼──────────┼─────────┼───────────┤
│  9 │  -1.02 │ guilty   │    1.28 │ strong    │
╘════╧════════╧══════════╧═════════╧═══════════╛

df

TEST 8 -- MNB & SVM with Vectorizer 8¶

# V8: TF-IDF weighted unigrams + bigrams with the default token pattern; MNB first.
vec = bigram_tv
classifier = mnb

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})

# Same vectorizer, now with the linear SVM.
classifier = svm

model, score, report = get_model(X,y,y_labels, y_labels, classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})

============ time_spent :  10_or_less
╒════╤════════╤═════════════════════════════╤═════════╤═══════════════════════════════════════════╕
│    │   Most │ Likely                      │   Least │ Likely                                    │
╞════╪════════╪═════════════════════════════╪═════════╪═══════════════════════════════════════════╡
│  0 │  -7.15 │ especially                  │   -5.35 │ first_person_pronoun first_person_pronoun │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  1 │  -7.15 │ peace god                   │   -5.34 │ im                                        │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  2 │  -7.11 │ first_person_pronoun come   │   -5.34 │ want                                      │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  3 │  -7.11 │ lord pronoun                │   -5.30 │ family                                    │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  4 │  -7.11 │ guilty first_person_pronoun │   -5.09 │ pronoun first_person_pronoun              │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  5 │  -7.11 │ years pronoun               │   -4.99 │ love pronoun                              │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  6 │  -7.10 │ showed                      │   -4.97 │ first_person_pronoun love                 │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  7 │  -7.09 │ people first_person_pronoun │   -4.77 │ love                                      │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  8 │  -7.09 │ jesus first_person_pronoun  │   -3.76 │ pronoun                                   │
├────┼────────┼─────────────────────────────┼─────────┼───────────────────────────────────────────┤
│  9 │  -7.09 │ going first_person_pronoun  │   -3.31 │ first_person_pronoun                      │
╘════╧════════╧═════════════════════════════╧═════════╧═══════════════════════════════════════════╛
============ time_spent :  10_or_less
╒════╤════════╤════════════════════════════╤═════════╤═════════════════════════════╕
│    │   Most │ Likely                     │   Least │ Likely                      │
╞════╪════════╪════════════════════════════╪═════════╪═════════════════════════════╡
│  0 │  -1.18 │ taking                     │    0.87 │ everybody                   │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  1 │  -1.06 │ death row                  │    0.90 │ love pronoun                │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  2 │  -1.05 │ years                      │    0.90 │ good                        │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  3 │  -1.03 │ say pronoun                │    0.93 │ kill pronoun                │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  4 │  -1.01 │ row                        │    0.93 │ brother                     │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  5 │  -0.99 │ kids                       │    0.94 │ david                       │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  6 │  -0.99 │ ive                        │    1.00 │ strong                      │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  7 │  -0.98 │ thank first_person_pronoun │    1.18 │ strong first_person_pronoun │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  8 │  -0.96 │ thanks                     │    1.19 │ praise                      │
├────┼────────┼────────────────────────────┼─────────┼─────────────────────────────┤
│  9 │  -0.94 │ first_person_pronoun heart │    1.27 │ first_person_pronoun want   │
╘════╧════════╧════════════════════════════╧═════════╧═════════════════════════════╛

# Bare expression: in a notebook this renders `df` as the cell output.
# NOTE(review): `df` is presumably the results table returned by the
# preceding update_big_df call -- confirm against the cell above.
df

TEST 9 -- MNB & SVM with Vectorizer 9¶

# TEST 9: run both classifiers against the same vectorizer (bigram_tv_v2,
# logged as 'V9'). Each pass fits/scores via get_model, hands the fitted
# model to return_features, and records the score with update_big_df.
vec = bigram_tv_v2

for clf_label, classifier in (('mnb', mnb), ('svm', svm)):
    model, score, report = get_model(X, y, y_labels, y_labels, classifier, vec)
    return_features(vec, model)
    result_row = {'classifier': clf_label, 'vectorizer': 'V9', 'score': score}
    df = update_big_df(big_df, result_row)

============ time_spent :  10_or_less
╒════╤════════╤═══════════════╤═════════╤══════════════╕
│    │   Most │ Likely        │   Least │ Likely       │
╞════╪════════╪═══════════════╪═════════╪══════════════╡
│  0 │  -6.99 │ especially    │   -5.11 │ sorry        │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  1 │  -6.94 │ years pronoun │   -5.09 │ im           │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  2 │  -6.93 │ pronoun time  │   -5.09 │ yall         │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  3 │  -6.92 │ showed        │   -5.08 │ god          │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  4 │  -6.91 │ given         │   -5.04 │ know         │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  5 │  -6.91 │ taking        │   -5.00 │ want         │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  6 │  -6.91 │ touch         │   -4.93 │ family       │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  7 │  -6.91 │ john          │   -4.66 │ love pronoun │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  8 │  -6.91 │ pronoun wont  │   -4.43 │ love         │
├────┼────────┼───────────────┼─────────┼──────────────┤
│  9 │  -6.90 │ god forgive   │   -3.41 │ pronoun      │
╘════╧════════╧═══════════════╧═════════╧══════════════╛
============ time_spent :  10_or_less
╒════╤════════╤═════════════╤═════════╤══════════════╕
│    │   Most │ Likely      │   Least │ Likely       │
╞════╪════════╪═════════════╪═════════╪══════════════╡
│  0 │  -1.34 │ know did    │    0.90 │ everybody    │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  1 │  -1.33 │ taking      │    0.91 │ love pronoun │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  2 │  -1.14 │ ive         │    0.92 │ happening    │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  3 │  -1.08 │ death row   │    0.92 │ good         │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  4 │  -1.06 │ tell family │    0.94 │ fear         │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  5 │  -1.06 │ father      │    0.94 │ reason       │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  6 │  -1.05 │ brought     │    1.04 │ brother      │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  7 │  -1.05 │ years       │    1.10 │ praise       │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  8 │  -1.03 │ thanks      │    1.11 │ pronoun dont │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  9 │  -1.00 │ row         │    1.28 │ strong       │
╘════╧════════╧═════════════╧═════════╧══════════════╛

# Display the results table assigned by the last update_big_df call above
# (one row per classifier/vectorizer run with its score).
df

	classifier	vectorizer	score
0	mnb	V1	0.511013
1	svm	V1	0.484581
2	mnb	V2	0.511013
3	svm	V2	0.475771
4	mnb	V3	0.466960