HW7: Comparing MNB & SVM with Kaggle Sentiment Data¶

OVERVIEW¶

VECTORIZERS USED:¶

CountVectorizer
TfidfVectorizer

MODELS USED:¶

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)

VECTORIZATION PARAMS:¶

Binary
Stopwords
Unigrams, Bigrams
Min & Max df

TODO:¶

Stemming?
VADER + TextBlob

FUNCTION & PACKAGE PARTY¶

## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZERS
# Settings common to every vectorizer below: latin-1 decoding, terms must occur
# in at least 5 documents, and English stop words are removed.
_SHARED_VEC_KW = dict(encoding='latin-1', min_df=5, stop_words='english')

# The "_v2" variants (and unigram_cv) only keep purely alphabetic tokens of two
# or more letters, which drops numbers and mixed tokens.
_ALPHA_TOKEN = r'(?u)\b[a-zA-Z]{2,}\b'

# --- Count-based, unigram
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_SHARED_VEC_KW)
unigram_bool_cv_v2 = CountVectorizer(binary=True, token_pattern=_ALPHA_TOKEN, **_SHARED_VEC_KW)

unigram_cv = CountVectorizer(binary=False, token_pattern=_ALPHA_TOKEN, **_SHARED_VEC_KW)

# --- Count-based, unigram + bigram
bigram_cv = CountVectorizer(ngram_range=(1,2), **_SHARED_VEC_KW)
bigram_cv_v2 = CountVectorizer(ngram_range=(1,2), token_pattern=_ALPHA_TOKEN, **_SHARED_VEC_KW)

# --- TF-IDF, unigram
unigram_tv = TfidfVectorizer(use_idf=True, **_SHARED_VEC_KW)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, token_pattern=_ALPHA_TOKEN, **_SHARED_VEC_KW)

# --- TF-IDF, unigram + bigram
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=(1,2), **_SHARED_VEC_KW)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, ngram_range=(1,2), token_pattern=_ALPHA_TOKEN,
                               **_SHARED_VEC_KW)

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
# Shared estimator instances; every experiment cell re-fits one of these.
mnb = MultinomialNB()
# With the default iteration cap liblinear stopped before converging on some
# feature sets (see the ConvergenceWarning in the recorded output), so the cap
# is raised explicitly.
svm = LinearSVC(C=1, max_iter=10000)

def get_test_train_vec(X,y,vectorizer):
    """Split the data 60/40 and vectorize both portions.

    The split uses a fixed seed (random_state=0) so repeated runs see the same
    partition. The vectorizer is fitted on the training documents only and
    then applied, unchanged, to the held-out documents.

    Returns:
        (X_train_vec, X_test_vec, y_train, y_test)
    """
    split = train_test_split(X, y, test_size=0.4, random_state=0)
    train_docs, test_docs, train_targets, test_targets = split
    return (
        vectorizer.fit_transform(train_docs),  # learns the vocabulary here
        vectorizer.transform(test_docs),       # reuses that vocabulary
        train_targets,
        test_targets,
    )

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit ``classifier`` on the training matrix and evaluate it on the test matrix.

    Args:
        X_train_vec, X_test_vec: vectorized training / test documents.
        y_train, y_test: the corresponding targets.
        labels: accepted for call-site compatibility; not used by this function.
        target_names: display names forwarded to ``classification_report``.
        classifier: estimator exposing ``fit``, ``predict`` and ``score``.

    Returns:
        (fitted classifier, test-set score, classification report as a dict)
    """
    # NOTE(review): target_names is passed without `labels`, so the names are
    # presumably matched to the classes in the report's own label order --
    # confirm that ['yes', 'no'] lines up with the values in y.
    classifier.fit(X_train_vec, y_train)
    predictions = classifier.predict(X_test_vec)
    report = classification_report(
        y_test, predictions, target_names=target_names, output_dict=True
    )
    return classifier, classifier.score(X_test_vec, y_test), report
    
def get_model(X, y, labels, target_names, classifier, vec):
    """Run the full pipeline: split + vectorize with ``vec``, then fit and score.

    Returns the same triple as ``run_classifier``:
    (fitted model, test-set score, classification report dict).
    """
    return run_classifier(
        *get_test_train_vec(X, y, vec), labels, target_names, classifier
    )
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def return_features(vec, model):
    """Print, for each row of ``model.coef_``, the 10 lowest- and 10 highest-weighted features.

    Args:
        vec: a fitted vectorizer; supplies the feature names.
        model: a fitted estimator exposing ``coef_`` (one row of weights per
            iteration of the loop below).
    """
    # get_feature_names() is deprecated and later removed in newer
    # scikit-learn releases; prefer the replacement when it exists and fall
    # back so older installs keep working.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    for i,feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        # Sort (weight, feature) pairs once, ascending by weight.
        ranked = sorted(zip(feature_probability, feature_names))
        # NOTE(review): the left two columns hold the 10 smallest weights and
        # the right two the 10 largest, yet the headers read "Most Likely" /
        # "Least Likely" in that order -- confirm this labelling is intended.
        df1 = pd.DataFrame(ranked[:10])
        df2 = pd.DataFrame(ranked[-10:])
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    """Record ``new_row`` and return the results so far as a DataFrame.

    ``big_df`` is a plain list of row dicts and is appended to in place, so the
    caller's list keeps growing across calls. The returned DataFrame is built
    from the whole list with exact duplicate rows removed (the list itself
    still contains them).
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()

DATA GOES HERE:¶

# import pandas as pd
# train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
# y=train['Sentiment'].values
# X=train['Phrase'].values


import pandas as pd
# Load the dataset; the `last_statement` and `vic_kid` columns are read from it below.
df = pd.read_csv('../death_row_discritized.csv')

def to_string(tokens):
    """Turn the text form of a token list (e.g. "['a', 'b']") into "a b".

    Args:
        tokens: string holding a Python literal sequence of strings.

    Returns:
        The tokens joined by single spaces, or the sentinel "error" when the
        value cannot be parsed as a literal or is not a sequence of strings
        (for example a missing/NaN cell).
    """
    # Imported locally so the function is self-contained.
    import ast

    try:
        # literal_eval only accepts Python literals, so cell content can never
        # execute arbitrary code the way eval() could.
        return " ".join(ast.literal_eval(tokens))
    except (ValueError, SyntaxError, TypeError, RecursionError, MemoryError):
        return "error"
    
# Target: the `vic_kid` column.
y = df['vic_kid'].values

# Documents: each row's `last_statement` token list flattened to one string.
df['statement_string'] = df.apply(
    lambda row: to_string(row['last_statement']),
    axis=1,
)
X = df['statement_string'].values

TASK 1¶

TEST 1 -- MNB & SVM with Vectorizer 1¶

# Fresh accumulator for the per-experiment result rows.
big_df = list()

# V1: binary unigram counts, default token pattern -- scored with MNB.
vec, classifier = unigram_bool_cv_v1, mnb

model, score, report = get_model(
    X, y, ['yes', 'no'], ['yes', 'no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='mnb', vectorizer='V1', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤══════════════════════╕
│    │   Most │ Likely    │   Least │ Likely               │
╞════╪════════╪═══════════╪═════════╪══════════════════════╡
│  0 │  -7.32 │ ahead     │   -4.49 │ yes                  │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  1 │  -7.32 │ allowed   │   -4.43 │ god                  │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  2 │  -7.32 │ anger     │   -4.43 │ warden               │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  3 │  -7.32 │ asked     │   -4.38 │ lord                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  4 │  -7.32 │ away      │   -4.38 │ want                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  5 │  -7.32 │ big       │   -4.28 │ family               │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  6 │  -7.32 │ chance    │   -4.28 │ know                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  7 │  -7.32 │ committed │   -3.71 │ love                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  8 │  -7.32 │ david     │   -3.47 │ pronoun              │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  9 │  -7.32 │ gon       │   -3.43 │ first_person_pronoun │
╘════╧════════╧═══════════╧═════════╧══════════════════════╛

# V1: binary unigram counts, default token pattern -- scored with SVM.
vec, classifier = unigram_bool_cv_v1, svm

model, score, report = get_model(
    X, y, ['yes','no'], ['yes','no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='svm', vectorizer='V1', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤════════════╕
│    │   Most │ Likely    │   Least │ Likely     │
╞════╪════════╪═══════════╪═════════╪════════════╡
│  0 │  -0.98 │ didnt     │    0.64 │ alright    │
├────┼────────┼───────────┼─────────┼────────────┤
│  1 │  -0.83 │ shown     │    0.66 │ ms         │
├────┼────────┼───────────┼─────────┼────────────┤
│  2 │  -0.80 │ kids      │    0.71 │ supporters │
├────┼────────┼───────────┼─────────┼────────────┤
│  3 │  -0.62 │ aint      │    0.78 │ long       │
├────┼────────┼───────────┼─────────┼────────────┤
│  4 │  -0.62 │ really    │    0.82 │ said       │
├────┼────────┼───────────┼─────────┼────────────┤
│  5 │  -0.61 │ tell      │    0.83 │ doing      │
├────┼────────┼───────────┼─────────┼────────────┤
│  6 │  -0.59 │ mr        │    0.85 │ sins       │
├────┼────────┼───────────┼─────────┼────────────┤
│  7 │  -0.57 │ goodbye   │    0.91 │ bad        │
├────┼────────┼───────────┼─────────┼────────────┤
│  8 │  -0.57 │ state     │    1.02 │ taken      │
├────┼────────┼───────────┼─────────┼────────────┤
│  9 │  -0.56 │ situation │    1.15 │ blessing   │
╘════╧════════╧═══════════╧═════════╧════════════╛

NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.

TEST 2 -- MNB & SVM with Vectorizer 2¶

# V2: binary unigram counts, alphabetic-only tokens -- scored with MNB.
vec, classifier = unigram_bool_cv_v2, mnb

model, score, report = get_model(
    X, y, ['yes','no'], ['yes','no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='mnb', vectorizer='V2', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤══════════╕
│    │   Most │ Likely    │   Least │ Likely   │
╞════╪════════╪═══════════╪═════════╪══════════╡
│  0 │  -7.29 │ ahead     │   -4.46 │ thank    │
├────┼────────┼───────────┼─────────┼──────────┤
│  1 │  -7.29 │ allowed   │   -4.46 │ yes      │
├────┼────────┼───────────┼─────────┼──────────┤
│  2 │  -7.29 │ anger     │   -4.40 │ god      │
├────┼────────┼───────────┼─────────┼──────────┤
│  3 │  -7.29 │ asked     │   -4.40 │ warden   │
├────┼────────┼───────────┼─────────┼──────────┤
│  4 │  -7.29 │ away      │   -4.35 │ lord     │
├────┼────────┼───────────┼─────────┼──────────┤
│  5 │  -7.29 │ big       │   -4.35 │ want     │
├────┼────────┼───────────┼─────────┼──────────┤
│  6 │  -7.29 │ chance    │   -4.25 │ family   │
├────┼────────┼───────────┼─────────┼──────────┤
│  7 │  -7.29 │ committed │   -4.25 │ know     │
├────┼────────┼───────────┼─────────┼──────────┤
│  8 │  -7.29 │ david     │   -3.68 │ love     │
├────┼────────┼───────────┼─────────┼──────────┤
│  9 │  -7.29 │ gon       │   -3.44 │ pronoun  │
╘════╧════════╧═══════════╧═════════╧══════════╛

# V2: binary unigram counts, alphabetic-only tokens -- scored with SVM.
vec, classifier = unigram_bool_cv_v2, svm

model, score, report = get_model(
    X, y, ['yes','no'], ['yes','no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='svm', vectorizer='V2', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -0.99 │ didnt    │    0.66 │ ms         │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.84 │ shown    │    0.66 │ alright    │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.80 │ kids     │    0.71 │ long       │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.63 │ really   │    0.71 │ supporters │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.63 │ real     │    0.81 │ said       │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.61 │ tell     │    0.88 │ sins       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.58 │ state    │    0.93 │ doing      │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.57 │ texas    │    0.94 │ bad        │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.57 │ aint     │    1.01 │ taken      │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.56 │ soon     │    1.14 │ blessing   │
╘════╧════════╧══════════╧═════════╧════════════╛

TEST 3 -- MNB & SVM with Vectorizer 3¶

# V3: raw (non-binary) unigram counts, alphabetic-only tokens -- scored with MNB.
vec, classifier = unigram_cv, mnb

model, score, report = get_model(
    X, y, ['yes','no'], ['yes','no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='mnb', vectorizer='V3', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤══════════╕
│    │   Most │ Likely    │   Least │ Likely   │
╞════╪════════╪═══════════╪═════════╪══════════╡
│  0 │  -7.80 │ ahead     │   -4.43 │ god      │
├────┼────────┼───────────┼─────────┼──────────┤
│  1 │  -7.80 │ allowed   │   -4.43 │ like     │
├────┼────────┼───────────┼─────────┼──────────┤
│  2 │  -7.80 │ anger     │   -4.36 │ lord     │
├────┼────────┼───────────┼─────────┼──────────┤
│  3 │  -7.80 │ asked     │   -4.16 │ thank    │
├────┼────────┼───────────┼─────────┼──────────┤
│  4 │  -7.80 │ away      │   -4.11 │ family   │
├────┼────────┼───────────┼─────────┼──────────┤
│  5 │  -7.80 │ big       │   -4.06 │ sorry    │
├────┼────────┼───────────┼─────────┼──────────┤
│  6 │  -7.80 │ chance    │   -4.04 │ want     │
├────┼────────┼───────────┼─────────┼──────────┤
│  7 │  -7.80 │ committed │   -3.74 │ know     │
├────┼────────┼───────────┼─────────┼──────────┤
│  8 │  -7.80 │ david     │   -3.13 │ love     │
├────┼────────┼───────────┼─────────┼──────────┤
│  9 │  -7.80 │ gon       │   -1.75 │ pronoun  │
╘════╧════════╧═══════════╧═════════╧══════════╛

# V3: raw (non-binary) unigram counts, alphabetic-only tokens -- scored with SVM.
vec, classifier = unigram_cv, svm

model, score, report = get_model(
    X, y, ['yes','no'], ['yes','no'], classifier, vec
)
return_features(vec, model)

df = update_big_df(big_df, dict(classifier='svm', vectorizer='V3', score=score))
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -0.77 │ didnt    │    0.63 │ happening  │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.77 │ state    │    0.64 │ said       │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.72 │ shown    │    0.65 │ especially │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.64 │ wanted   │    0.71 │ supporters │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.54 │ kids     │    0.72 │ comfort    │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.54 │ youre    │    0.75 │ killing    │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.53 │ need     │    0.85 │ taken      │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.52 │ thanks   │    0.97 │ sins       │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.52 │ looking  │    1.06 │ doing      │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.48 │ mercy    │    1.13 │ blessing   │
╘════╧════════╧══════════╧═════════╧════════════╛

TEST 4 -- MNB & SVM with Vectorizer 4¶

# V4: unigram + bigram counts, default token pattern.
vec = bigram_cv

# Fit, inspect and log MNB first, then SVM, on the same vectorizer.
for classifier in (mnb, svm):
    model, score, report = get_model(X, y, ['yes','no'], ['yes','no'], classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {
        'classifier': 'mnb' if classifier is mnb else 'svm',
        'vectorizer': 'V4',
        'score': score,
    })
df

============ Sentiment Score:  0
╒════╤════════╤═════════════════════════╤═════════╤═══════════════════════════════════════════╕
│    │   Most │ Likely                  │   Least │ Likely                                    │
╞════╪════════╪═════════════════════════╪═════════╪═══════════════════════════════════════════╡
│  0 │  -8.51 │ ah first_person_pronoun │   -4.78 │ sorry                                     │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  1 │  -8.51 │ ahead                   │   -4.75 │ want                                      │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  2 │  -8.51 │ allowed                 │   -4.66 │ first_person_pronoun first_person_pronoun │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  3 │  -8.51 │ anger                   │   -4.45 │ know                                      │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  4 │  -8.51 │ asked                   │   -4.21 │ pronoun first_person_pronoun              │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  5 │  -8.51 │ away                    │   -4.20 │ love pronoun                              │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  6 │  -8.51 │ big                     │   -4.06 │ first_person_pronoun love                 │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  7 │  -8.51 │ chance                  │   -3.84 │ love                                      │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  8 │  -8.51 │ committed               │   -2.46 │ pronoun                                   │
├────┼────────┼─────────────────────────┼─────────┼───────────────────────────────────────────┤
│  9 │  -8.51 │ david                   │   -1.91 │ first_person_pronoun                      │
╘════╧════════╧═════════════════════════╧═════════╧═══════════════════════════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤════════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                     │   Least │ Likely                       │
╞════╪════════╪════════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -0.79 │ soon                       │    0.38 │ friends first_person_pronoun │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -0.58 │ ready warden               │    0.38 │ forgiven                     │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -0.50 │ ready home                 │    0.40 │ way first_person_pronoun     │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -0.42 │ like tell                  │    0.43 │ taken                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -0.41 │ sir first_person_pronoun   │    0.44 │ ready                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -0.39 │ first_person_pronoun ready │    0.48 │ doing                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -0.37 │ say pronoun                │    0.49 │ support first_person_pronoun │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -0.35 │ want pronoun               │    0.61 │ yes first_person_pronoun     │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -0.35 │ want thank                 │    0.75 │ yall pronoun                 │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -0.33 │ tell                       │    0.93 │ blessing                     │
╘════╧════════╧════════════════════════════╧═════════╧══════════════════════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

# Display the results table accumulated so far.
df

TEST 5 -- MNB & SVM with Vectorizer 5¶

# V5: unigram + bigram counts, alphabetic-only tokens.
vec = bigram_cv_v2

# Fit, inspect and log MNB first, then SVM, on the same vectorizer.
for classifier in (mnb, svm):
    model, score, report = get_model(X, y, ['yes','no'], ['yes','no'], classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {
        'classifier': 'mnb' if classifier is mnb else 'svm',
        'vectorizer': 'V5',
        'score': score,
    })

============ Sentiment Score:  0
╒════╤════════╤════════════╤═════════╤═════════════════╕
│    │   Most │ Likely     │   Least │ Likely          │
╞════╪════════╪════════════╪═════════╪═════════════════╡
│  0 │  -8.11 │ ahead      │   -4.67 │ lord            │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  1 │  -8.11 │ allowed    │   -4.47 │ thank           │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  2 │  -8.11 │ anger      │   -4.44 │ pronoun pronoun │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  3 │  -8.11 │ asked      │   -4.42 │ family          │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  4 │  -8.11 │ away       │   -4.37 │ sorry           │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  5 │  -8.11 │ big        │   -4.35 │ want            │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  6 │  -8.11 │ chance     │   -4.05 │ know            │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  7 │  -8.11 │ committed  │   -3.79 │ love pronoun    │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  8 │  -8.11 │ david      │   -3.43 │ love            │
├────┼────────┼────────────┼─────────┼─────────────────┤
│  9 │  -8.11 │ didnt kill │   -2.06 │ pronoun         │
╘════╧════════╧════════════╧═════════╧═════════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════════╤═════════╤═════════════════╕
│    │   Most │ Likely       │   Least │ Likely          │
╞════╪════════╪══════════════╪═════════╪═════════════════╡
│  0 │  -0.78 │ soon         │    0.46 │ want tell       │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  1 │  -0.65 │ ready warden │    0.47 │ taken           │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  2 │  -0.63 │ aint         │    0.49 │ killing         │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  3 │  -0.55 │ ready home   │    0.49 │ support pronoun │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  4 │  -0.48 │ like tell    │    0.52 │ supporters      │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  5 │  -0.43 │ want pronoun │    0.56 │ doing           │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  6 │  -0.42 │ kids         │    0.66 │ sins            │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  7 │  -0.41 │ tell         │    0.67 │ yes             │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  8 │  -0.39 │ didnt        │    0.77 │ yall pronoun    │
├────┼────────┼──────────────┼─────────┼─────────────────┤
│  9 │  -0.39 │ words        │    1.00 │ blessing        │
╘════╧════════╧══════════════╧═════════╧═════════════════╛

# Display the results table accumulated so far.
df

TEST 6 -- MNB & SVM with Vectorizer 6¶

# V6: unigram TF-IDF, default token pattern.
vec = unigram_tv

# Fit, inspect and log MNB first, then SVM, on the same vectorizer.
for classifier in (mnb, svm):
    model, score, report = get_model(X, y, ['yes','no'], ['yes','no'], classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {
        'classifier': 'mnb' if classifier is mnb else 'svm',
        'vectorizer': 'V6',
        'score': score,
    })

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤══════════════════════╕
│    │   Most │ Likely    │   Least │ Likely               │
╞════╪════════╪═══════════╪═════════╪══════════════════════╡
│  0 │  -6.21 │ ahead     │   -4.90 │ yall                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  1 │  -6.21 │ allowed   │   -4.90 │ ready                │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  2 │  -6.21 │ anger     │   -4.89 │ yes                  │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  3 │  -6.21 │ asked     │   -4.87 │ know                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  4 │  -6.21 │ away      │   -4.87 │ sorry                │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  5 │  -6.21 │ big       │   -4.86 │ thank                │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  6 │  -6.21 │ chance    │   -4.81 │ im                   │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  7 │  -6.21 │ committed │   -4.40 │ love                 │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  8 │  -6.21 │ david     │   -3.43 │ pronoun              │
├────┼────────┼───────────┼─────────┼──────────────────────┤
│  9 │  -6.21 │ gon       │   -3.00 │ first_person_pronoun │
╘════╧════════╧═══════════╧═════════╧══════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -1.08 │ tell     │    0.81 │ forgiven   │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.94 │ family   │    0.82 │ okay       │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.82 │ shown    │    0.83 │ doing      │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.80 │ wanted   │    0.87 │ killing    │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.70 │ didnt    │    0.88 │ supporters │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.68 │ sir      │    0.94 │ said       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.66 │ going    │    0.96 │ long       │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.66 │ help     │    1.04 │ yes        │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.64 │ need     │    1.21 │ sins       │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.64 │ kids     │    1.40 │ blessing   │
╘════╧════════╧══════════╧═════════╧════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

# Display the results table accumulated so far.
df

TEST 7 -- MNB & SVM with Vectorizer 7¶

# V7: unigram TF-IDF, alphabetic-only tokens.
vec = unigram_tv_v2

# Fit, inspect and log MNB first, then SVM, on the same vectorizer.
for classifier in (mnb, svm):
    model, score, report = get_model(X, y, ['yes','no'], ['yes','no'], classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {
        'classifier': 'mnb' if classifier is mnb else 'svm',
        'vectorizer': 'V7',
        'score': score,
    })

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤══════════╕
│    │   Most │ Likely    │   Least │ Likely   │
╞════╪════════╪═══════════╪═════════╪══════════╡
│  0 │  -6.22 │ ahead     │   -4.91 │ want     │
├────┼────────┼───────────┼─────────┼──────────┤
│  1 │  -6.22 │ allowed   │   -4.86 │ ready    │
├────┼────────┼───────────┼─────────┼──────────┤
│  2 │  -6.22 │ anger     │   -4.84 │ yall     │
├────┼────────┼───────────┼─────────┼──────────┤
│  3 │  -6.22 │ asked     │   -4.83 │ yes      │
├────┼────────┼───────────┼─────────┼──────────┤
│  4 │  -6.22 │ away      │   -4.78 │ im       │
├────┼────────┼───────────┼─────────┼──────────┤
│  5 │  -6.22 │ big       │   -4.77 │ thank    │
├────┼────────┼───────────┼─────────┼──────────┤
│  6 │  -6.22 │ chance    │   -4.77 │ know     │
├────┼────────┼───────────┼─────────┼──────────┤
│  7 │  -6.22 │ committed │   -4.73 │ sorry    │
├────┼────────┼───────────┼─────────┼──────────┤
│  8 │  -6.22 │ david     │   -4.26 │ love     │
├────┼────────┼───────────┼─────────┼──────────┤
│  9 │  -6.22 │ gon       │   -3.28 │ pronoun  │
╘════╧════════╧═══════════╧═════════╧══════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -1.06 │ tell     │    0.84 │ alright    │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.89 │ wanted   │    0.86 │ doing      │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.86 │ shown    │    0.88 │ okay       │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.82 │ family   │    0.89 │ supporters │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.72 │ sir      │    0.94 │ killing    │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.71 │ didnt    │    0.96 │ said       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.69 │ need     │    0.98 │ long       │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.69 │ going    │    1.03 │ yes        │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.67 │ hope     │    1.23 │ sins       │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.65 │ help     │    1.42 │ blessing   │
╘════╧════════╧══════════╧═════════╧════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

# Display the results table accumulated so far.
df

TEST 8 -- MNB & SVM with Vectorizer 8¶

# V8: unigram + bigram TF-IDF, default token pattern.
vec = bigram_tv

# Fit, inspect and log MNB first, then SVM, on the same vectorizer.
for classifier in (mnb, svm):
    model, score, report = get_model(X, y, ['yes','no'], ['yes','no'], classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {
        'classifier': 'mnb' if classifier is mnb else 'svm',
        'vectorizer': 'V8',
        'score': score,
    })

============ Sentiment Score:  0
╒════╤════════╤═════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                  │   Least │ Likely                       │
╞════╪════════╪═════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -6.80 │ ah first_person_pronoun │   -5.63 │ sorry                        │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -6.80 │ ahead                   │   -5.61 │ thank                        │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -6.80 │ allowed                 │   -5.60 │ im                           │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -6.80 │ anger                   │   -5.59 │ know                         │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -6.80 │ asked                   │   -5.48 │ pronoun first_person_pronoun │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -6.80 │ away                    │   -5.34 │ first_person_pronoun love    │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -6.80 │ big                     │   -5.33 │ love pronoun                 │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -6.80 │ chance                  │   -5.18 │ love                         │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -6.80 │ committed               │   -4.21 │ pronoun                      │
├────┼────────┼─────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -6.80 │ david                   │   -3.78 │ first_person_pronoun         │
╘════╧════════╧═════════════════════════╧═════════╧══════════════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════════════════════╤═════════╤═══════════════════════════╕
│    │   Most │ Likely                   │   Least │ Likely                    │
╞════╪════════╪══════════════════════════╪═════════╪═══════════════════════════╡
│  0 │  -0.82 │ tell                     │    0.77 │ alright                   │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  1 │  -0.78 │ sir first_person_pronoun │    0.78 │ first_person_pronoun sins │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  2 │  -0.72 │ soon                     │    0.78 │ sins                      │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  3 │  -0.65 │ like tell                │    0.78 │ christ                    │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  4 │  -0.64 │ god                      │    0.81 │ ill pronoun               │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  5 │  -0.64 │ wanted                   │    0.87 │ man first_person_pronoun  │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  6 │  -0.62 │ tell pronoun             │    0.87 │ way first_person_pronoun  │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  7 │  -0.57 │ ready warden             │    0.90 │ yall pronoun              │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  8 │  -0.56 │ family friends           │    1.12 │ yes first_person_pronoun  │
├────┼────────┼──────────────────────────┼─────────┼───────────────────────────┤
│  9 │  -0.55 │ want pronoun             │    1.25 │ blessing                  │
╘════╧════════╧══════════════════════════╧═════════╧═══════════════════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

# Display the running results table (columns: classifier, vectorizer, score).
df

TEST 9 -- MNB & SVM with Vectorizer 9 (bigram_tv_v2)

# TEST 9: run both classifiers against vectorizer V9 (bigram_tv_v2).
# For each classifier, in order: build the model, report its features, then
# record the score as a row tagged with the classifier name and 'V9'.
vec = bigram_tv_v2

for clf_name, classifier in (('mnb', mnb), ('svm', svm)):
    # Fresh label lists are passed on every call, exactly as in the
    # hand-written version of this cell.
    model, score, report = get_model(X, y, ['yes', 'no'], ['yes', 'no'], classifier, vec)
    return_features(vec, model)
    result_row = {'classifier': clf_name, 'vectorizer': 'V9', 'score': score}
    df = update_big_df(big_df, result_row)

============ Sentiment Score:  0
╒════╤════════╤════════════╤═════════╤══════════════╕
│    │   Most │ Likely     │   Least │ Likely       │
╞════╪════════╪════════════╪═════════╪══════════════╡
│  0 │  -6.62 │ ahead      │   -5.40 │ want         │
├────┼────────┼────────────┼─────────┼──────────────┤
│  1 │  -6.62 │ allowed    │   -5.38 │ ready        │
├────┼────────┼────────────┼─────────┼──────────────┤
│  2 │  -6.62 │ anger      │   -5.34 │ im           │
├────┼────────┼────────────┼─────────┼──────────────┤
│  3 │  -6.62 │ asked      │   -5.31 │ thank        │
├────┼────────┼────────────┼─────────┼──────────────┤
│  4 │  -6.62 │ away       │   -5.30 │ yes          │
├────┼────────┼────────────┼─────────┼──────────────┤
│  5 │  -6.62 │ big        │   -5.26 │ sorry        │
├────┼────────┼────────────┼─────────┼──────────────┤
│  6 │  -6.62 │ chance     │   -5.25 │ know         │
├────┼────────┼────────────┼─────────┼──────────────┤
│  7 │  -6.62 │ committed  │   -4.98 │ love pronoun │
├────┼────────┼────────────┼─────────┼──────────────┤
│  8 │  -6.62 │ david      │   -4.82 │ love         │
├────┼────────┼────────────┼─────────┼──────────────┤
│  9 │  -6.62 │ didnt kill │   -3.83 │ pronoun      │
╘════╧════════╧════════════╧═════════╧══════════════╛
============ Sentiment Score:  0
╒════╤════════╤════════════════╤═════════╤══════════════╕
│    │   Most │ Likely         │   Least │ Likely       │
╞════╪════════╪════════════════╪═════════╪══════════════╡
│  0 │  -0.89 │ tell           │    0.79 │ christ       │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  1 │  -0.74 │ tell pronoun   │    0.79 │ supporters   │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  2 │  -0.70 │ like tell      │    0.84 │ ill pronoun  │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  3 │  -0.70 │ soon           │    0.84 │ just want    │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  4 │  -0.70 │ family friends │    0.90 │ know         │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  5 │  -0.67 │ wanted         │    0.90 │ yall pronoun │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  6 │  -0.65 │ hope pronoun   │    0.90 │ alright      │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  7 │  -0.62 │ pronoun love   │    1.06 │ sins         │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  8 │  -0.61 │ state          │    1.10 │ yes          │
├────┼────────┼────────────────┼─────────┼──────────────┤
│  9 │  -0.61 │ want pronoun   │    1.25 │ blessing     │
╘════╧════════╧════════════════╧═════════╧══════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

# Display the results table after the TEST 9 rows were recorded.
df

# --- Disabled cell: Kaggle submission with LinearSVC ---------------------
# Every line below is commented out, so nothing in this cell runs.
# If re-enabled it would: read kaggle-sentiment/test.tsv, vectorize its
# 'Phrase' column with bigram_cv_v2, fit LinearSVC(C=1) on the training
# split of X/y, predict on the Kaggle phrases, and write PhraseId,Sentiment
# rows to kaggle_submission_linearSVC_v5.csv.
# NOTE(review): k_vec is built with bigram_cv_v2.transform() before
# do_the_kaggle() calls fit_transform() on the same vectorizer, so the
# Kaggle matrix may not line up with the vocabulary the classifier is
# trained on -- confirm the fit/transform order before re-enabling.
# NOTE(review): test_size=None in train_test_split presumably falls back to
# the library's default hold-out instead of training on all rows -- verify.
# NOTE(review): pred_vec is assigned but never referenced in this cell.

# pred_vec = bigram_cv_v2

# test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
# k_id = test['PhraseId'].values
# k_text = test['Phrase'].values

# k_vec = bigram_cv_v2.transform(k_text)
# k_vec

# def get_kaggle_test_train_vec(X,y,vectorizer):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)
#     X_train_vec = vectorizer.fit_transform(X_train)
#     X_test_vec = vectorizer.transform(X_test)
#     return X_train_vec, X_test_vec, y_train, y_test

# def do_the_kaggle(X,y,vec):
#     X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X,y,vec)
#     svm_clf = LinearSVC(C=1)
#     prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)
#     kaggle_submission = zip(k_id, prediction)
#     outf=open('kaggle_submission_linearSVC_v5.csv', 'w')
#     outf.write('PhraseId,Sentiment\n')
#     for x, value in enumerate(kaggle_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')
#     outf.close()
#     print('prediction complete')

# do_the_kaggle(X,y,bigram_cv_v2)

# Display the final results table (its printed output follows this cell).
df

	classifier	vectorizer	score
0	mnb	V1	0.775330
1	svm	V1	0.731278
4	mnb	V2	0.770925
5	svm	V2	0.731278
6	mnb	V3	0.753304