HW7: Comparing MNB & SVM with Kaggle Sentiment Data¶

OVERVIEW¶

VECTORIZERS USED:¶

CountVectorizer
TfidfVectorizer

MODELS USED:¶

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)

VECTORIZATION PARAMS:¶

Binary
Stopwords
Unigrams, Bigrams
Min & Max df

TODO:¶

Stemming?
VADER + TextBlob

FUNCTION & PACKAGE PARTY¶

## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZERS
# Keyword settings every vectorizer below has in common.
_shared_opts = dict(encoding='latin-1', min_df=5, stop_words='english')
# Token pattern for the variants that keep only alphabetic tokens of 2+ letters.
_alpha_tokens = r'(?u)\b[a-zA-Z]{2,}\b'
# n-gram range for the "bigram" variants: unigrams and bigrams together.
_uni_and_bi = (1, 2)

# Boolean (presence/absence) unigram counts, default and alphabetic-only tokens.
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_shared_opts)
unigram_bool_cv_v2 = CountVectorizer(binary=True, token_pattern=_alpha_tokens, **_shared_opts)

# Raw unigram counts, alphabetic-only tokens.
unigram_cv = CountVectorizer(binary=False, token_pattern=_alpha_tokens, **_shared_opts)

# Unigram + bigram counts, default and alphabetic-only tokens.
bigram_cv = CountVectorizer(ngram_range=_uni_and_bi, **_shared_opts)
bigram_cv_v2 = CountVectorizer(ngram_range=_uni_and_bi, token_pattern=_alpha_tokens, **_shared_opts)

# TF-IDF weighted unigrams, default and alphabetic-only tokens.
unigram_tv = TfidfVectorizer(use_idf=True, **_shared_opts)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, token_pattern=_alpha_tokens, **_shared_opts)

# TF-IDF weighted unigrams + bigrams, default and alphabetic-only tokens.
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=_uni_and_bi, **_shared_opts)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, ngram_range=_uni_and_bi,
                               token_pattern=_alpha_tokens, **_shared_opts)

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
# Module-level estimator instances reused by every test cell; run_classifier
# calls .fit() on whichever one it is handed, so each run refits the same object.
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X, y, vectorizer, *, test_size=0.4, random_state=0):
    """Split the data into train/test parts and vectorize both.

    Args:
        X: Raw documents, passed straight to ``train_test_split``.
        y: Labels aligned with ``X``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``;
            it is fitted in place on the training documents.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.4, the
            value this function previously hard-coded.
        random_state: Forwarded to ``train_test_split``. Defaults to 0, the
            value this function previously hard-coded.

    Returns:
        Tuple ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit on the training documents only; the test documents are transformed
    # with the already-fitted vectorizer.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit ``classifier`` on the training split and evaluate it on the test split.

    The estimator is fitted in place and that same object is returned, so a
    later call with the same instance refits it.

    ``labels`` is accepted but not used by this function.
    NOTE(review): ``target_names`` is handed to ``classification_report``
    as-is; confirm its order matches the label order of ``y``.

    Returns:
        Tuple ``(classifier, score, report)`` where ``score`` comes from
        ``classifier.score`` on the test split and ``report`` is the
        ``classification_report`` output as a dict.
    """
    classifier.fit(X_train_vec, y_train)
    predictions = classifier.predict(X_test_vec)
    report = classification_report(
        y_test,
        predictions,
        target_names=target_names,
        output_dict=True,
    )
    test_score = classifier.score(X_test_vec, y_test)
    return classifier, test_score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    """Vectorize a train/test split of ``X``/``y`` and fit ``classifier`` on it.

    Chains ``get_test_train_vec`` (using ``vec``) and ``run_classifier``.

    Returns:
        Tuple ``(model, score, report)`` exactly as produced by
        ``run_classifier``.
    """
    train_vec, test_vec, train_y, test_y = get_test_train_vec(X, y, vec)
    fitted, test_score, summary = run_classifier(
        train_vec, test_vec, train_y, test_y, labels, target_names, classifier
    )
    return fitted, test_score, summary
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def _class_feature_weights(model):
    """Return the per-class feature weight rows for a fitted ``model``."""
    weights = getattr(model, "coef_", None)
    if weights is not None:
        return weights
    # Newer scikit-learn releases dropped coef_ from the Naive Bayes
    # estimators. Rebuild what it used to expose: the positive-class row of
    # feature_log_prob_ for two classes, the full matrix otherwise.
    log_prob = model.feature_log_prob_
    return log_prob[1:] if len(model.classes_) == 2 else log_prob


def return_features(vec, model):
    """Print the ten lowest- and ten highest-weighted features per weight row.

    Args:
        vec: Fitted vectorizer whose feature names line up with the model's
            feature weights.
        model: Fitted estimator exposing ``coef_`` (or, failing that,
            ``feature_log_prob_`` and ``classes_``).

    Features are sorted ascending by weight, so the first two printed columns
    hold the ten lowest weights and the last two the ten highest.
    NOTE(review): the headers label the low end "Most Likely"; they are kept
    unchanged so output matches earlier runs -- confirm whether the two
    labels should be swapped.
    """
    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); use whichever this version offers.
    get_names = getattr(vec, "get_feature_names_out", None) or vec.get_feature_names
    feature_names = list(get_names())

    for i, feature_probability in enumerate(_class_feature_weights(model)):
        print('============ Sentiment Score: ', i)
        # Sort once and slice both ends instead of sorting twice.
        ranked = sorted(zip(feature_probability, feature_names))
        df1 = pd.DataFrame(ranked[:10])
        df2 = pd.DataFrame(ranked[-10:])
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    """Record ``new_row`` and return the de-duplicated results table.

    Args:
        big_df: List of row dicts collected so far. It is mutated in place:
            ``new_row`` is appended on every call, including repeats.
        new_row: Dict describing one result row.

    Returns:
        A DataFrame built from ``big_df`` with exact duplicate rows dropped.
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()

DATA GOES HERE:¶

# import pandas as pd
# train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
# y=train['Sentiment'].values
# X=train['Phrase'].values


import pandas as pd
# Load the dataset; the path is relative to the directory the notebook runs from.
# NOTE(review): the name `df` is reused later for the results table returned by
# update_big_df, so X and y must be extracted before the first test cell runs.
df = pd.read_csv('../death_row_discritized.csv')

def to_string(tokens):
    """Turn the text form of a token list into one space-joined string.

    Args:
        tokens: String holding a Python literal sequence of strings,
            e.g. ``"['i', 'love', 'you']"``.

    Returns:
        The tokens joined by single spaces, or the literal string ``"error"``
        when ``tokens`` cannot be parsed as a literal or its items are not
        all strings.
    """
    import ast  # imported here so the function does not depend on another cell

    try:
        # literal_eval parses literals only, so cell contents read from the
        # CSV cannot execute arbitrary code the way eval() would allow.
        return " ".join(ast.literal_eval(tokens))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Keep the best-effort contract: unparseable cells become "error".
        return "error"
    
# Rebuild each last statement as a single string for the vectorizers.
# Applying to_string to the column directly avoids building a row object per
# record, which the previous row-wise DataFrame.apply did.
df['statement_string'] = df['last_statement'].apply(to_string)
# Alternative target column:
# y=df['vic_kid'].values
y=df['vic_police'].values
X=df['statement_string'].values

TASK 1¶

TEST 1 -- MNB & SVM with Vectorizer 1¶

big_df = []

vec = unigram_bool_cv_v1
classifier = mnb

model, score, report = get_model(X,y,['yes', 'no'],['yes', 'no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════════════════╕
│    │   Most │ Likely   │   Least │ Likely               │
╞════╪════════╪══════════╪═════════╪══════════════════════╡
│  0 │  -6.56 │ ah       │   -4.48 │ god                  │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  1 │  -6.56 │ ahead    │   -4.48 │ hope                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  2 │  -6.56 │ allah    │   -4.48 │ like                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  3 │  -6.56 │ allowed  │   -4.36 │ want                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  4 │  -6.56 │ almighty │   -4.25 │ know                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  5 │  -6.56 │ alright  │   -4.25 │ thank                │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  6 │  -6.56 │ amen     │   -3.99 │ family               │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  7 │  -6.56 │ anybody  │   -3.99 │ love                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  8 │  -6.56 │ art      │   -3.67 │ first_person_pronoun │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  9 │  -6.56 │ away     │   -3.67 │ pronoun              │
╘════╧════════╧══════════╧═════════╧══════════════════════╛

vec = unigram_bool_cv_v1
classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -0.54 │ lord     │    0.41 │ guess    │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -0.43 │ brothers │    0.41 │ know     │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -0.41 │ best     │    0.42 │ caused   │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -0.40 │ ones     │    0.50 │ bless    │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -0.40 │ sorry    │    0.51 │ thank    │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -0.40 │ warden   │    0.52 │ sir      │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -0.38 │ strong   │    0.57 │ father   │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -0.38 │ jesus    │    0.60 │ innocent │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -0.38 │ pain     │    0.60 │ deserve  │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -0.37 │ stay     │    1.17 │ mean     │
╘════╧════════╧══════════╧═════════╧══════════╛

NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.

TEST 2 -- MNB & SVM with Vectorizer 2¶

vec = unigram_bool_cv_v2
classifier = mnb


model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -6.53 │ ah       │   -4.74 │ sorry    │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -6.53 │ ahead    │   -4.45 │ god      │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -6.53 │ allah    │   -4.45 │ hope     │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -6.53 │ allowed  │   -4.45 │ like     │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -6.53 │ almighty │   -4.33 │ want     │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -6.53 │ alright  │   -4.23 │ know     │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -6.53 │ amen     │   -4.23 │ thank    │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -6.53 │ anybody  │   -3.97 │ family   │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -6.53 │ art      │   -3.97 │ love     │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -6.53 │ away     │   -3.64 │ pronoun  │
╘════╧════════╧══════════╧═════════╧══════════╛

vec = unigram_bool_cv_v2
classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -0.54 │ lord     │    0.41 │ little   │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -0.43 │ brothers │    0.42 │ know     │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -0.41 │ best     │    0.42 │ caused   │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -0.41 │ warden   │    0.50 │ bless    │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -0.40 │ ones     │    0.50 │ thank    │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -0.40 │ sorry    │    0.53 │ sir      │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -0.40 │ strong   │    0.58 │ father   │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -0.38 │ pain     │    0.59 │ deserve  │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -0.38 │ stay     │    0.60 │ innocent │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -0.37 │ jesus    │    1.16 │ mean     │
╘════╧════════╧══════════╧═════════╧══════════╛

TEST 3 -- MNB & SVM with Vectorizer 3¶

vec = unigram_cv
classifier = mnb


model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -6.86 │ ah       │   -4.47 │ like     │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -6.86 │ ahead    │   -4.47 │ peace    │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -6.86 │ allah    │   -4.30 │ god      │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -6.86 │ allowed  │   -4.30 │ want     │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -6.86 │ almighty │   -4.16 │ know     │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -6.86 │ alright  │   -4.16 │ sorry    │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -6.86 │ amen     │   -4.03 │ thank    │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -6.86 │ anybody  │   -3.69 │ family   │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -6.86 │ art      │   -3.37 │ love     │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -6.86 │ away     │   -1.86 │ pronoun  │
╘════╧════════╧══════════╧═════════╧══════════╛

vec = unigram_cv
classifier = svm


model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -0.46 │ lord     │    0.36 │ friends  │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -0.44 │ warden   │    0.39 │ caused   │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -0.39 │ jesus    │    0.44 │ little   │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -0.38 │ pain     │    0.49 │ deserve  │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -0.37 │ dont     │    0.49 │ innocent │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -0.37 │ ones     │    0.50 │ sir      │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -0.36 │ ready    │    0.51 │ said     │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -0.34 │ strong   │    0.51 │ father   │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -0.34 │ man      │    0.52 │ ive      │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -0.34 │ ok       │    1.08 │ mean     │
╘════╧════════╧══════════╧═════════╧══════════╛

TEST 4 -- MNB & SVM with Vectorizer 4¶

vec = bigram_cv
classifier = mnb

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤════════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                     │   Least │ Likely                       │
╞════╪════════╪════════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -7.56 │ ah                         │   -4.85 │ know                         │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -7.56 │ ah first_person_pronoun    │   -4.85 │ sorry                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -7.56 │ ahead                      │   -4.73 │ thank                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -7.56 │ allah                      │   -4.38 │ family                       │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -7.56 │ allah first_person_pronoun │   -4.38 │ love pronoun                 │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -7.56 │ allowed                    │   -4.19 │ first_person_pronoun love    │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -7.56 │ almighty                   │   -4.09 │ pronoun first_person_pronoun │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -7.56 │ alright                    │   -4.06 │ love                         │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -7.56 │ amen                       │   -2.56 │ pronoun                      │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -7.56 │ anybody                    │   -2.30 │ first_person_pronoun         │
╘════╧════════╧════════════════════════════╧═════════╧══════════════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                       │   Least │ Likely                       │
╞════╪════════╪══════════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -0.31 │ warden                       │    0.30 │ goodbye first_person_pronoun │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -0.31 │ lord                         │    0.32 │ family                       │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -0.31 │ pronoun family               │    0.34 │ know pronoun                 │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -0.26 │ jesus                        │    0.35 │ pronoun first_person_pronoun │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -0.25 │ first_person_pronoun forgive │    0.37 │ know im                      │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -0.23 │ first_person_pronoun want    │    0.39 │ said                         │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -0.20 │ better                       │    0.41 │ innocent                     │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -0.20 │ ill                          │    0.43 │ first_person_pronoun said    │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -0.19 │ thats                        │    0.44 │ deserve                      │
├────┼────────┼──────────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -0.19 │ ok                           │    0.53 │ mean                         │
╘════╧════════╧══════════════════════════════╧═════════╧══════════════════════════════╛

df

TEST 5 -- MNB & SVM with Vectorizer 5¶

vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})

classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤═══════════════════╤═════════╤═══════════════╕
│    │   Most │ Likely            │   Least │ Likely        │
╞════╪════════╪═══════════════════╪═════════╪═══════════════╡
│  0 │  -7.24 │ ah                │   -4.68 │ god           │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  1 │  -7.24 │ ahead             │   -4.68 │ want          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  2 │  -7.24 │ allah             │   -4.60 │ thank pronoun │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  3 │  -7.24 │ allowed           │   -4.53 │ know          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  4 │  -7.24 │ almighty          │   -4.53 │ sorry         │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  5 │  -7.24 │ alright           │   -4.41 │ thank         │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  6 │  -7.24 │ amen              │   -4.06 │ family        │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  7 │  -7.24 │ anybody           │   -4.06 │ love pronoun  │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  8 │  -7.24 │ apologize pronoun │   -3.74 │ love          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  9 │  -7.24 │ art               │   -2.24 │ pronoun       │
╘════╧════════╧═══════════════════╧═════════╧═══════════════╛
============ Sentiment Score:  0
╒════╤════════╤═════════════════╤═════════╤══════════════╕
│    │   Most │ Likely          │   Least │ Likely       │
╞════╪════════╪═════════════════╪═════════╪══════════════╡
│  0 │  -0.37 │ warden          │    0.34 │ friends      │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  1 │  -0.35 │ lord            │    0.34 │ father       │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  2 │  -0.29 │ jesus           │    0.36 │ ive          │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  3 │  -0.27 │ pronoun support │    0.36 │ know pronoun │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  4 │  -0.27 │ pronoun family  │    0.38 │ know im      │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  5 │  -0.26 │ god             │    0.38 │ friends love │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  6 │  -0.25 │ thats           │    0.41 │ innocent     │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  7 │  -0.24 │ ok              │    0.54 │ deserve      │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  8 │  -0.23 │ strong          │    0.56 │ said         │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  9 │  -0.22 │ pain            │    0.67 │ mean         │
╘════╧════════╧═════════════════╧═════════╧══════════════╛

df

TEST 6 -- MNB & SVM with Vectorizer 6¶

vec = unigram_tv
classifier = mnb


model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})

classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════════════════╕
│    │   Most │ Likely   │   Least │ Likely               │
╞════╪════════╪══════════╪═════════╪══════════════════════╡
│  0 │  -5.93 │ ah       │   -5.20 │ friends              │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  1 │  -5.93 │ ahead    │   -5.20 │ goodbye              │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  2 │  -5.93 │ allah    │   -5.18 │ want                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  3 │  -5.93 │ allowed  │   -5.07 │ know                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  4 │  -5.93 │ almighty │   -5.06 │ thank                │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  5 │  -5.93 │ alright  │   -5.05 │ innocent             │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  6 │  -5.93 │ amen     │   -4.94 │ family               │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  7 │  -5.93 │ anybody  │   -4.92 │ love                 │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  8 │  -5.93 │ art      │   -3.98 │ pronoun              │
├────┼────────┼──────────┼─────────┼──────────────────────┤
│  9 │  -5.93 │ away     │   -3.75 │ first_person_pronoun │
╘════╧════════╧══════════╧═════════╧══════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -0.68 │ jesus    │    0.66 │ especially │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.67 │ lord     │    0.69 │ little     │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.62 │ stay     │    0.73 │ family     │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.59 │ warden   │    0.75 │ happen     │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.58 │ strong   │    0.77 │ friends    │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.57 │ im       │    0.81 │ head       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.49 │ death    │    0.85 │ time       │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.42 │ ok       │    0.91 │ deserve    │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.42 │ better   │    1.16 │ innocent   │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.42 │ pain     │    1.31 │ mean       │
╘════╧════════╧══════════╧═════════╧════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

df

TEST 7 -- MNB & SVM with Vectorizer 7¶

vec = unigram_tv_v2
classifier = mnb

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})

classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤══════════╕
│    │   Most │ Likely   │   Least │ Likely   │
╞════╪════════╪══════════╪═════════╪══════════╡
│  0 │  -5.93 │ ah       │   -5.21 │ like     │
├────┼────────┼──────────┼─────────┼──────────┤
│  1 │  -5.93 │ ahead    │   -5.16 │ goodbye  │
├────┼────────┼──────────┼─────────┼──────────┤
│  2 │  -5.93 │ allah    │   -5.11 │ want     │
├────┼────────┼──────────┼─────────┼──────────┤
│  3 │  -5.93 │ allowed  │   -5.11 │ friends  │
├────┼────────┼──────────┼─────────┼──────────┤
│  4 │  -5.93 │ almighty │   -5.03 │ innocent │
├────┼────────┼──────────┼─────────┼──────────┤
│  5 │  -5.93 │ alright  │   -5.01 │ know     │
├────┼────────┼──────────┼─────────┼──────────┤
│  6 │  -5.93 │ amen     │   -4.96 │ thank    │
├────┼────────┼──────────┼─────────┼──────────┤
│  7 │  -5.93 │ anybody  │   -4.82 │ family   │
├────┼────────┼──────────┼─────────┼──────────┤
│  8 │  -5.93 │ art      │   -4.81 │ love     │
├────┼────────┼──────────┼─────────┼──────────┤
│  9 │  -5.93 │ away     │   -3.85 │ pronoun  │
╘════╧════════╧══════════╧═════════╧══════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════╕
│    │   Most │ Likely   │   Least │ Likely     │
╞════╪════════╪══════════╪═════════╪════════════╡
│  0 │  -0.72 │ lord     │    0.59 │ know       │
├────┼────────┼──────────┼─────────┼────────────┤
│  1 │  -0.70 │ jesus    │    0.59 │ family     │
├────┼────────┼──────────┼─────────┼────────────┤
│  2 │  -0.64 │ stay     │    0.67 │ especially │
├────┼────────┼──────────┼─────────┼────────────┤
│  3 │  -0.61 │ death    │    0.70 │ little     │
├────┼────────┼──────────┼─────────┼────────────┤
│  4 │  -0.60 │ strong   │    0.74 │ happen     │
├────┼────────┼──────────┼─────────┼────────────┤
│  5 │  -0.60 │ warden   │    0.77 │ time       │
├────┼────────┼──────────┼─────────┼────────────┤
│  6 │  -0.54 │ pain     │    0.80 │ deserve    │
├────┼────────┼──────────┼─────────┼────────────┤
│  7 │  -0.54 │ im       │    0.81 │ head       │
├────┼────────┼──────────┼─────────┼────────────┤
│  8 │  -0.52 │ way      │    1.12 │ innocent   │
├────┼────────┼──────────┼─────────┼────────────┤
│  9 │  -0.48 │ ms       │    1.27 │ mean       │
╘════╧════════╧══════════╧═════════╧════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

df

TEST 8 -- MNB & SVM with Vectorizer 8¶

vec = bigram_tv
classifier = mnb

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})

classifier = svm

model, score, report = get_model(X,y,['yes','no'],['yes','no'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤════════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                     │   Least │ Likely                       │
╞════╪════════╪════════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -6.58 │ ah                         │   -5.88 │ love pronoun                 │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -6.58 │ ah first_person_pronoun    │   -5.85 │ thank                        │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -6.58 │ ahead                      │   -5.85 │ know                         │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -6.58 │ allah                      │   -5.77 │ first_person_pronoun love    │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -6.58 │ allah first_person_pronoun │   -5.75 │ innocent                     │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -6.58 │ allowed                    │   -5.74 │ family                       │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -6.58 │ almighty                   │   -5.71 │ love                         │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -6.58 │ alright                    │   -5.68 │ pronoun first_person_pronoun │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -6.58 │ amen                       │   -4.81 │ pronoun                      │
├────┼────────┼────────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -6.58 │ anybody                    │   -4.60 │ first_person_pronoun         │
╘════╧════════╧════════════════════════════╧═════════╧══════════════════════════════╛
============ Sentiment Score:  0
╒════╤════════╤═══════════════════════════════════════════╤═════════╤══════════════════════════════╕
│    │   Most │ Likely                                    │   Least │ Likely                       │
╞════╪════════╪═══════════════════════════════════════════╪═════════╪══════════════════════════════╡
│  0 │  -0.62 │ jesus                                     │    0.57 │ first_person_pronoun said    │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  1 │  -0.60 │ lord                                      │    0.59 │ happen                       │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  2 │  -0.55 │ im                                        │    0.62 │ know im                      │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  3 │  -0.54 │ warden                                    │    0.65 │ family                       │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  4 │  -0.49 │ stay                                      │    0.71 │ head                         │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  5 │  -0.48 │ pronoun family                            │    0.73 │ goodbye first_person_pronoun │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  6 │  -0.46 │ strong                                    │    0.81 │ pronoun first_person_pronoun │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  7 │  -0.46 │ first_person_pronoun                      │    0.83 │ deserve                      │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  8 │  -0.45 │ first_person_pronoun first_person_pronoun │    0.92 │ mean                         │
├────┼────────┼───────────────────────────────────────────┼─────────┼──────────────────────────────┤
│  9 │  -0.44 │ better                                    │    1.13 │ innocent                     │
╘════╧════════╧═══════════════════════════════════════════╧═════════╧══════════════════════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

df

TEST 9 -- MNB & SVM with Vectorizer 9¶

# TEST 9: run both classifiers against the same vectorizer (bigram_tv_v2, the
# TF-IDF vectorizer configured with ngram_range=(1,2)) and record one results
# row per classifier under the vectorizer label 'V9'.
vec = bigram_tv_v2

# Order matters: MNB runs first, then SVM, so after the loop `classifier`,
# `model`, `score`, `report` and `df` all hold the values from the SVM run.
for classifier, clf_label in ((mnb, 'mnb'), (svm, 'svm')):
    # get_model yields a (model, score, report) triple; both label-list
    # arguments are ['yes', 'no'] for this experiment.
    model, score, report = get_model(X, y, ['yes', 'no'], ['yes', 'no'], classifier, vec)
    return_features(vec, model)
    # NOTE(review): big_df (not the returned df) is passed on every call --
    # presumably update_big_df accumulates rows itself; confirm against its
    # definition.
    df = update_big_df(big_df, {'classifier': clf_label, 'vectorizer': 'V9', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤═══════════════════╤═════════╤═══════════════╕
│    │   Most │ Likely            │   Least │ Likely        │
╞════╪════════╪═══════════════════╪═════════╪═══════════════╡
│  0 │  -6.38 │ ah                │   -5.68 │ want          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  1 │  -6.38 │ ahead             │   -5.68 │ goodbye       │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  2 │  -6.38 │ allah             │   -5.61 │ thank pronoun │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  3 │  -6.38 │ allowed           │   -5.57 │ know          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  4 │  -6.38 │ almighty          │   -5.57 │ love pronoun  │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  5 │  -6.38 │ alright           │   -5.54 │ thank         │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  6 │  -6.38 │ amen              │   -5.53 │ innocent      │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  7 │  -6.38 │ anybody           │   -5.43 │ family        │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  8 │  -6.38 │ apologize pronoun │   -5.39 │ love          │
├────┼────────┼───────────────────┼─────────┼───────────────┤
│  9 │  -6.38 │ art               │   -4.46 │ pronoun       │
╘════╧════════╧═══════════════════╧═════════╧═══════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════╤═════════╤════════════════╕
│    │   Most │ Likely   │   Least │ Likely         │
╞════╪════════╪══════════╪═════════╪════════════════╡
│  0 │  -0.65 │ lord     │    0.54 │ pronoun head   │
├────┼────────┼──────────┼─────────┼────────────────┤
│  1 │  -0.65 │ jesus    │    0.56 │ pronoun sister │
├────┼────────┼──────────┼─────────┼────────────────┤
│  2 │  -0.61 │ warden   │    0.56 │ especially     │
├────┼────────┼──────────┼─────────┼────────────────┤
│  3 │  -0.55 │ death    │    0.56 │ know pronoun   │
├────┼────────┼──────────┼─────────┼────────────────┤
│  4 │  -0.55 │ im       │    0.61 │ know im        │
├────┼────────┼──────────┼─────────┼────────────────┤
│  5 │  -0.45 │ pray     │    0.67 │ happen         │
├────┼────────┼──────────┼─────────┼────────────────┤
│  6 │  -0.45 │ way      │    0.75 │ head           │
├────┼────────┼──────────┼─────────┼────────────────┤
│  7 │  -0.42 │ strong   │    0.87 │ deserve        │
├────┼────────┼──────────┼─────────┼────────────────┤
│  8 │  -0.42 │ thats    │    0.93 │ mean           │
├────┼────────┼──────────┼─────────┼────────────────┤
│  9 │  -0.41 │ stay     │    1.08 │ innocent       │
╘════╧════════╧══════════╧═════════╧════════════════╛

/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)

# Bare expression: as the last statement of a notebook cell this renders `df`,
# the value most recently returned by update_big_df.
df

# pred_vec = bigram_cv_v2

# test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
# k_id = test['PhraseId'].values
# k_text = test['Phrase'].values

# k_vec = bigram_cv_v2.transform(k_text)
# k_vec

# def get_kaggle_test_train_vec(X,y,vectorizer):
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)
#     X_train_vec = vectorizer.fit_transform(X_train)
#     X_test_vec = vectorizer.transform(X_test)
#     return X_train_vec, X_test_vec, y_train, y_test

# def do_the_kaggle(X,y,vec):
#     X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X,y,vec)
#     svm_clf = LinearSVC(C=1)
#     prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)
#     kaggle_submission = zip(k_id, prediction)
#     outf=open('kaggle_submission_linearSVC_v5.csv', 'w')
#     outf.write('PhraseId,Sentiment\n')
#     for x, value in enumerate(kaggle_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')
#     outf.close()
#     print('prediction complete')

# do_the_kaggle(X,y,bigram_cv_v2)

# Bare expression: renders the results table (columns: classifier, vectorizer,
# score) as the cell's output.
df

	classifier	vectorizer	score
0	mnb	V1	0.881057
1	svm	V1	0.845815
2	mnb	V2	0.881057
3	svm	V2	0.845815
4	mnb	V3	0.867841