HW7: Comparing MNB & SVM with Kaggle Sentiment Data¶

OVERVIEW¶

VECTORIZERS USED:¶

CountVectorizer
TfidfVectorizer

MODELS USED:¶

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)

VECTORIZATION PARAMS:¶

Binary
Stopwords
Unigrams, Bigrams
Min & Max df

TODO:¶

Stemming?
VADER + TextBlob

FUNCTION & PACKAGE PARTY¶

## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## PREPROCESSING
## =======================================================

# FIRST - removing anything with 3 or fewer characters
# def my_preprocessor(doc):
# #     print('PREPROCESSING!!!!!')
#     if len(doc) > 3:
#         return(doc)
#     else:
#         return('none')
    
def my_preprocessor(doc):
    """Replace very short documents with the placeholder ``'empty'``.

    Used as the ``preprocessor=`` callable of the ``*_v3`` / ``*_v4`` /
    ``*_v5`` TF-IDF vectorizers defined in this file.

    NOTE: supplying a custom ``preprocessor`` to a scikit-learn vectorizer
    overrides its built-in preprocessing stage (lowercasing / accent
    stripping), and this function does not lowercase the text itself.

    Parameters
    ----------
    doc : str
        Raw document text.

    Returns
    -------
    str
        ``doc`` unchanged when it is longer than 2 characters, otherwise
        the literal string ``'empty'``.
    """
    # Anything of length 0-2 is swapped for the literal token 'empty'.
    return doc if len(doc) > 2 else 'empty'

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZERS
# Keyword groups shared by the variants below, so each definition only spells
# out what makes it different.
_SHARED_KW = dict(encoding='latin-1', min_df=5, stop_words='english')
_ALPHA_KW = dict(token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')  # tokens of 2+ ASCII letters only
_BIGRAM_KW = dict(ngram_range=(1,2))                      # unigrams + bigrams

# Binary (presence/absence) unigram counts.
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_SHARED_KW)
unigram_bool_cv_v2 = CountVectorizer(binary=True, **_SHARED_KW, **_ALPHA_KW)

# Raw unigram counts.
unigram_cv = CountVectorizer(binary=False, **_SHARED_KW, **_ALPHA_KW)

# Unigram + bigram counts.
bigram_cv = CountVectorizer(**_BIGRAM_KW, **_SHARED_KW)
bigram_cv_v2 = CountVectorizer(**_BIGRAM_KW, **_SHARED_KW, **_ALPHA_KW)

# TF-IDF weighted unigrams.
unigram_tv = TfidfVectorizer(use_idf=True, **_SHARED_KW)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, **_SHARED_KW, **_ALPHA_KW)

# TF-IDF weighted unigrams + bigrams.
bigram_tv = TfidfVectorizer(use_idf=True, **_BIGRAM_KW, **_SHARED_KW)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, **_BIGRAM_KW, **_SHARED_KW, **_ALPHA_KW)

## ----- VECTORIZERS with PREPROCESSING

# Everything the preprocessed TF-IDF variants have in common; min_df is left
# out here because it is the setting that differs between v3, v4 and v5.
_PREPROCESSED_KW = dict(encoding='latin-1', use_idf=True, stop_words='english',
                        preprocessor=my_preprocessor, **_ALPHA_KW)

unigram_tv_v3 = TfidfVectorizer(min_df=5, **_PREPROCESSED_KW)
bigram_tv_v3 = TfidfVectorizer(min_df=5, **_BIGRAM_KW, **_PREPROCESSED_KW)

# v4: no min_df argument, so the library default applies.
bigram_tv_v4 = TfidfVectorizer(**_BIGRAM_KW, **_PREPROCESSED_KW)

# v5: looser document-frequency cut-off than v3.
bigram_tv_v5 = TfidfVectorizer(min_df=3, **_BIGRAM_KW, **_PREPROCESSED_KW)

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
# Module-level estimator instances shared by every experiment in this file.
# NOTE: run_classifier() calls .fit() on whichever instance it is handed and
# returns that same object, so each experiment re-fits (and overwrites) the
# single mnb / svm instance created here.
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X, y, vectorizer, test_size=0.4, random_state=0):
    """Split the data and vectorize both splits with ``vectorizer``.

    Parameters
    ----------
    X : sequence of str
        Raw documents.
    y : sequence
        Labels aligned with ``X``.
    vectorizer : object
        Vectorizer exposing ``fit_transform`` and ``transform``; it is
        fitted in place on the training split.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.4, the value that
        was previously hard-coded.
    random_state : int, optional
        Forwarded to ``train_test_split``. Defaults to 0, the value that was
        previously hard-coded.

    Returns
    -------
    tuple
        ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state)
    # The vectorizer is fitted on the training split only; the test split is
    # transformed with that already-fitted vectorizer.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit ``classifier`` on the training split and evaluate it on the test split.

    Parameters
    ----------
    X_train_vec, X_test_vec
        Vectorized training / test features.
    y_train, y_test
        Training / test labels.
    labels : list or None
        Class labels to include in the report, in the order matching
        ``target_names``.
    target_names : list of str
        Display names for the classes in the report.
    classifier
        Estimator exposing ``fit``, ``predict`` and ``score``. It is fitted
        in place and the same object is returned.

    Returns
    -------
    tuple
        ``(clf, score, report)``: the fitted estimator, the value of
        ``clf.score`` on the test split, and the classification report as a
        dict.
    """
    clf = classifier
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    # Forward `labels` (previously accepted but ignored) so the report rows stay
    # aligned with `target_names` even when a class is absent from the test split.
    report = classification_report(y_test, y_pred, labels=labels,
                                   target_names=target_names, output_dict=True)
    score = clf.score(X_test_vec, y_test)
    return clf, score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    """Split, vectorize, fit and evaluate in a single call.

    Chains ``get_test_train_vec`` (using ``vec``) into ``run_classifier``
    (using ``classifier``) and hands back the latter's result.

    Returns
    -------
    tuple
        ``(model, score, report)`` exactly as produced by ``run_classifier``.
    """
    split_data = get_test_train_vec(X, y, vec)
    # split_data is (X_train_vec, X_test_vec, y_train, y_test), which is the
    # leading positional order run_classifier expects.
    return run_classifier(*split_data, labels, target_names, classifier)
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def return_features(vec, model):
    """Print, for each class, the ten lowest- and ten highest-weighted features.

    Parameters
    ----------
    vec
        Fitted vectorizer; supplies the feature names.
    model
        Fitted classifier exposing per-class feature weights as ``coef_``
        or, failing that, ``feature_log_prob_``.

    Returns
    -------
    None
        One table per class is printed. The left column pair holds the ten
        smallest weights (features least indicative of the class) and the
        right pair the ten largest (most indicative).
    """
    # get_feature_names() was removed from scikit-learn vectorizers in favour
    # of get_feature_names_out(); use the newer method when it is available.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    # Naive Bayes estimators no longer expose coef_ in recent scikit-learn;
    # feature_log_prob_ is the attribute coef_ used to mirror.
    weights = getattr(model, 'coef_', None)
    if weights is None:
        weights = model.feature_log_prob_

    for i, feature_probability in enumerate(weights):
        print('============ Sentiment Score: ', i)
        # Sort once, ascending by weight (ties broken by feature name).
        ranked = sorted(zip(feature_probability, feature_names))
        lowest = pd.DataFrame(ranked[:10])
        highest = pd.DataFrame(ranked[-10:])
        table = pd.concat([lowest, highest], axis=1)
        # Headers fixed: the head of an ascending sort is the LEAST likely
        # features, the tail the MOST likely (they were labelled the other way).
        print(tabulate(table, tablefmt="fancy_grid", headers=["Least","Likely","Most","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    """Append a result row and return the de-duplicated results table.

    Parameters
    ----------
    big_df : list of dict
        Running list of result rows. Despite the name it is a plain list,
        and it is mutated in place (``new_row`` is appended).
    new_row : dict
        Row to add.

    Returns
    -------
    pandas.DataFrame
        All rows collected so far with exact duplicates dropped. The
        original row positions are kept as the index.
    """
    big_df.append(new_row)  # in place: the caller's list grows by one
    return pd.DataFrame(big_df).drop_duplicates()

DATA GOES HERE:¶

# import pandas as pd
# Load the Kaggle sentiment training file (tab-separated) and pull out the
# label ('Sentiment') and text ('Phrase') columns as arrays.
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
# Accumulator of result rows passed to update_big_df().
big_df = []

# Re-bind `train` to the CSV from ../HW2; this replaces the Kaggle frame
# loaded above. X and y are rebuilt from this frame's 'labels' / 'pruned_2'
# columns further down, so the Kaggle X / y above end up overwritten.
train=pd.read_csv("../HW2/hw7_data_sentiment.csv")
train.head()
# y=train['labels'].values
# X=train['pruned'].values

def remove_na(string):
    """Return ``string`` if it is text, otherwise the placeholder ``"empty"``.

    Parameters
    ----------
    string : object
        A cell value; may be a ``str`` or a non-string such as NaN / None.

    Returns
    -------
    str
        The input unchanged when it is a ``str`` (including ``str``
        subclasses), else the literal ``"empty"``.
    """
    # isinstance() instead of an exact type() comparison, so str subclasses
    # (e.g. numpy.str_) are kept rather than replaced by "empty".
    return string if isinstance(string, str) else "empty"
# Clean the 'pruned' text column element-wise. Mapping over the single column
# gives the same values as the former row-wise DataFrame.apply(axis=1) without
# building a Series object for every row.
train['pruned_2'] = train['pruned'].map(remove_na)

# Labels and cleaned text as arrays for the modeling helpers.
y = train['labels'].values
X = train['pruned_2'].values

# Start a fresh list of result rows for update_big_df().
big_df = []
train.head()

TASK 1¶

With Preprocessing¶

# Experiment: MultinomialNB on unigram TF-IDF features built with
# my_preprocessor (unigram_tv_v3).
vec = unigram_tv_v3
classifier = mnb

# Split, vectorize, fit and score; labels / target names cover classes 0-4.
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# Print the lowest- and highest-weighted features for each class.
return_features(vec, model)

# Add this run's score to the running results and display the table.
# NOTE(review): the row is tagged 'vectorizer': 'V1' here and in the following
# cells even though `vec` changes - confirm 'V1' is meant as a section label
# rather than a vectorizer id.
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤═══════════╤═════════╤════════════╕
│    │   Most │ Likely    │   Least │ Likely     │
╞════╪════════╪═══════════╪═════════╪════════════╡
│  0 │  -9.92 │ aaliyah   │   -6.55 │ time       │
├────┼────────┼───────────┼─────────┼────────────┤
│  1 │  -9.92 │ abandoned │   -6.55 │ characters │
├────┼────────┼───────────┼─────────┼────────────┤
│  2 │  -9.92 │ abbott    │   -6.47 │ comedy     │
├────┼────────┼───────────┼─────────┼────────────┤
│  3 │  -9.92 │ abdul     │   -6.45 │ dull       │
├────┼────────┼───────────┼─────────┼────────────┤
│  4 │  -9.92 │ abel      │   -6.37 │ minutes    │
├────┼────────┼───────────┼─────────┼────────────┤
│  5 │  -9.92 │ ably      │   -6.12 │ worst      │
├────┼────────┼───────────┼─────────┼────────────┤
│  6 │  -9.92 │ aborted   │   -6.09 │ just       │
├────┼────────┼───────────┼─────────┼────────────┤
│  7 │  -9.92 │ abound    │   -5.92 │ like       │
├────┼────────┼───────────┼─────────┼────────────┤
│  8 │  -9.92 │ abrahams  │   -5.75 │ film       │
├────┼────────┼───────────┼─────────┼────────────┤
│  9 │  -9.92 │ abridged  │   -4.98 │ movie      │
╘════╧════════╧═══════════╧═════════╧════════════╛
============ Sentiment Score:  1
╒════╤════════╤══════════════╤═════════╤════════════╕
│    │   Most │ Likely       │   Least │ Likely     │
╞════╪════════╪══════════════╪═════════╪════════════╡
│  0 │ -10.63 │ abel         │   -6.35 │ time       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  1 │ -10.63 │ ably         │   -6.28 │ plot       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  2 │ -10.63 │ abound       │   -6.20 │ characters │
├────┼────────┼──────────────┼─────────┼────────────┤
│  3 │ -10.63 │ abrahams     │   -6.00 │ story      │
├────┼────────┼──────────────┼─────────┼────────────┤
│  4 │ -10.63 │ abroad       │   -5.97 │ little     │
├────┼────────┼──────────────┼─────────┼────────────┤
│  5 │ -10.63 │ absorb       │   -5.91 │ does       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  6 │ -10.63 │ accentuating │   -5.91 │ just       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  7 │ -10.63 │ access       │   -5.67 │ like       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  8 │ -10.63 │ accidental   │   -5.41 │ film       │
├────┼────────┼──────────────┼─────────┼────────────┤
│  9 │ -10.63 │ acclaim      │   -5.13 │ movie      │
╘════╧════════╧══════════════╧═════════╧════════════╛
============ Sentiment Score:  2
╒════╤════════╤════════════════╤═════════╤════════════╕
│    │   Most │ Likely         │   Least │ Likely     │
╞════╪════════╪════════════════╪═════════╪════════════╡
│  0 │ -11.26 │ acclaim        │   -6.21 │ little     │
├────┼────────┼────────────────┼─────────┼────────────┤
│  1 │ -11.26 │ act            │   -6.17 │ movies     │
├────┼────────┼────────────────┼─────────┼────────────┤
│  2 │ -11.26 │ acumen         │   -6.09 │ rrb        │
├────┼────────┼────────────────┼─────────┼────────────┤
│  3 │ -11.26 │ adding         │   -6.09 │ characters │
├────┼────────┼────────────────┼─────────┼────────────┤
│  4 │ -11.26 │ admirers       │   -6.03 │ life       │
├────┼────────┼────────────────┼─────────┼────────────┤
│  5 │ -11.26 │ adorability    │   -5.90 │ time       │
├────┼────────┼────────────────┼─────────┼────────────┤
│  6 │ -11.26 │ affectionately │   -5.78 │ story      │
├────┼────────┼────────────────┼─────────┼────────────┤
│  7 │ -11.26 │ affirms        │   -5.76 │ like       │
├────┼────────┼────────────────┼─────────┼────────────┤
│  8 │ -11.26 │ ailments       │   -5.12 │ movie      │
├────┼────────┼────────────────┼─────────┼────────────┤
│  9 │ -11.26 │ airhead        │   -5.07 │ film       │
╘════╧════════╧════════════════╧═════════╧════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════╤═════════╤══════════╕
│    │   Most │ Likely      │   Least │ Likely   │
╞════╪════════╪═════════════╪═════════╪══════════╡
│  0 │ -10.77 │ aaliyah     │   -6.14 │ best     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  1 │ -10.77 │ abbott      │   -6.12 │ life     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  2 │ -10.77 │ abdul       │   -6.12 │ time     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  3 │ -10.77 │ abhorrent   │   -6.11 │ like     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  4 │ -10.77 │ abomination │   -6.01 │ love     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  5 │ -10.77 │ aborted     │   -6.01 │ story    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  6 │ -10.77 │ abridged    │   -5.61 │ funny    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  7 │ -10.77 │ abrupt      │   -5.32 │ movie    │
├────┼────────┼─────────────┼─────────┼──────────┤
│  8 │ -10.77 │ absence     │   -5.26 │ good     │
├────┼────────┼─────────────┼─────────┼──────────┤
│  9 │ -10.77 │ absent      │   -5.09 │ film     │
╘════╧════════╧═════════════╧═════════╧══════════╛
============ Sentiment Score:  4
╒════╤════════╤═════════════╤═════════╤══════════════╕
│    │   Most │ Likely      │   Least │ Likely       │
╞════╪════════╪═════════════╪═════════╪══════════════╡
│  0 │ -10.04 │ aaliyah     │   -6.19 │ entertaining │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  1 │ -10.04 │ abandon     │   -6.18 │ comedy       │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  2 │ -10.04 │ abandoned   │   -6.12 │ performance  │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  3 │ -10.04 │ abbott      │   -6.03 │ performances │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  4 │ -10.04 │ abdul       │   -5.92 │ great        │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  5 │ -10.04 │ abhorrent   │   -5.79 │ good         │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  6 │ -10.04 │ abject      │   -5.62 │ funny        │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  7 │ -10.04 │ ably        │   -5.47 │ movie        │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  8 │ -10.04 │ abomination │   -5.32 │ best         │
├────┼────────┼─────────────┼─────────┼──────────────┤
│  9 │ -10.04 │ aborted     │   -5.09 │ film         │
╘════╧════════╧═════════════╧═════════╧══════════════╛

# Experiment: MultinomialNB on unigram+bigram TF-IDF features built with
# my_preprocessor (bigram_tv_v3).
vec = bigram_tv_v3
classifier = mnb

# Split, vectorize, fit and score; labels / target names cover classes 0-4.
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# Print the lowest- and highest-weighted features for each class.
return_features(vec, model)

# Add this run's score to the running results and display the table.
# NOTE(review): same 'mnb' / 'V1' tags as the unigram_tv_v3 run, so the two
# rows differ only by score - confirm the intended vectorizer label.
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -10.71 │ aaliyah           │   -7.63 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -10.71 │ abandon political │   -7.63 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -10.71 │ abandoned         │   -7.53 │ comedy     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -10.71 │ abbott            │   -7.48 │ minutes    │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -10.71 │ abdul             │   -7.47 │ dull       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -10.71 │ abdul malik       │   -7.27 │ worst      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -10.71 │ abel              │   -7.12 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  7 │ -10.71 │ abel ferrara      │   -7.03 │ like       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  8 │ -10.71 │ ability document  │   -6.83 │ film       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  9 │ -10.71 │ ability images    │   -6.08 │ movie      │
╘════╧════════╧═══════════════════╧═════════╧════════════╛
============ Sentiment Score:  1
╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -11.16 │ abandon political │   -7.18 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -11.16 │ abel              │   -7.07 │ plot       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -11.16 │ abel ferrara      │   -6.97 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -11.16 │ ability document  │   -6.83 │ story      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -11.16 │ ability images    │   -6.80 │ little     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -11.16 │ ability make      │   -6.75 │ does       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -11.16 │ ability spoof     │   -6.71 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  7 │ -11.16 │ abject suffering  │   -6.50 │ like       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  8 │ -11.16 │ able accomplish   │   -6.24 │ film       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  9 │ -11.16 │ able look         │   -5.97 │ movie      │
╘════╧════════╧═══════════════════╧═════════╧════════════╛
============ Sentiment Score:  2
╒════╤════════╤══════════════════════╤═════════╤════════════╕
│    │   Most │ Likely               │   Least │ Likely     │
╞════╪════════╪══════════════════════╪═════════╪════════════╡
│  0 │ -11.64 │ abandon theater      │   -6.89 │ little     │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  1 │ -11.64 │ able performances    │   -6.87 │ movies     │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  2 │ -11.64 │ able project         │   -6.80 │ characters │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  3 │ -11.64 │ able tear            │   -6.75 │ life       │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  4 │ -11.64 │ abrupt drop          │   -6.71 │ rrb        │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  5 │ -11.64 │ absolute delight     │   -6.62 │ time       │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  6 │ -11.64 │ absolutely amazing   │   -6.50 │ story      │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  7 │ -11.64 │ absolutely essential │   -6.46 │ like       │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  8 │ -11.64 │ absorbing look       │   -5.85 │ movie      │
├────┼────────┼──────────────────────┼─────────┼────────────┤
│  9 │ -11.64 │ absorbing piece      │   -5.81 │ film       │
╘════╧════════╧══════════════════════╧═════════╧════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════════╤═════════╤══════════╕
│    │   Most │ Likely              │   Least │ Likely   │
╞════╪════════╪═════════════════════╪═════════╪══════════╡
│  0 │ -11.26 │ aaliyah             │   -6.95 │ time     │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  1 │ -11.26 │ abandon theater     │   -6.95 │ comedy   │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  2 │ -11.26 │ abbott              │   -6.95 │ life     │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  3 │ -11.26 │ abdul               │   -6.94 │ like     │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  4 │ -11.26 │ abdul malik         │   -6.84 │ love     │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  5 │ -11.26 │ abhorrent           │   -6.79 │ story    │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  6 │ -11.26 │ abhorrent abhorrent │   -6.42 │ funny    │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  7 │ -11.26 │ able enjoy          │   -6.15 │ movie    │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  8 │ -11.26 │ able project        │   -6.09 │ good     │
├────┼────────┼─────────────────────┼─────────┼──────────┤
│  9 │ -11.26 │ abomination         │   -5.90 │ film     │
╘════╧════════╧═════════════════════╧═════════╧══════════╛
============ Sentiment Score:  4
╒════╤════════╤═══════════════════╤═════════╤══════════════╕
│    │   Most │ Likely            │   Least │ Likely       │
╞════╪════════╪═══════════════════╪═════════╪══════════════╡
│  0 │ -10.78 │ aaliyah           │   -7.22 │ comedy       │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  1 │ -10.78 │ abandon           │   -7.15 │ entertaining │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  2 │ -10.78 │ abandon political │   -7.08 │ performance  │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  3 │ -10.78 │ abandon scripts   │   -7.00 │ performances │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  4 │ -10.78 │ abandon theater   │   -6.99 │ great        │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  5 │ -10.78 │ abandoned         │   -6.89 │ good         │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  6 │ -10.78 │ abbott            │   -6.64 │ funny        │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  7 │ -10.78 │ abdul             │   -6.53 │ movie        │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  8 │ -10.78 │ abdul malik       │   -6.45 │ best         │
├────┼────────┼───────────────────┼─────────┼──────────────┤
│  9 │ -10.78 │ abhorrent         │   -6.13 │ film         │
╘════╧════════╧═══════════════════╧═════════╧══════════════╛

# Experiment: LinearSVC (the shared `svm` instance) on unigram+bigram TF-IDF
# features built with my_preprocessor (bigram_tv_v3).
vec = bigram_tv_v3
classifier = svm

# Split, vectorize, fit and score; labels / target names cover classes 0-4.
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# Print the lowest- and highest-weighted features for each class.
return_features(vec, model)

# Add this run's score to the running results and display the table.
# NOTE(review): tagged 'V1' like the other cells in this section - confirm the
# intended vectorizer label.
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤══════════════════════╤═════════╤══════════════════════╕
│    │   Most │ Likely               │   Least │ Likely               │
╞════╪════════╪══════════════════════╪═════════╪══════════════════════╡
│  0 │  -1.32 │ hawke                │    2.00 │ entirely witless     │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  1 │  -1.22 │ loving               │    2.01 │ hideously            │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  2 │  -1.21 │ works minutes        │    2.02 │ time stinker         │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  3 │  -1.13 │ flick film           │    2.06 │ disgusting           │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  4 │  -1.13 │ movie does           │    2.08 │ minded stereotypical │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  5 │  -1.12 │ mind right           │    2.12 │ unappealing          │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  6 │  -1.11 │ watching documentary │    2.18 │ unwatchable          │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  7 │  -1.11 │ fair share           │    2.19 │ film barely          │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  8 │  -1.08 │ film make            │    2.34 │ premise just         │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  9 │  -1.07 │ trash cinema         │    2.71 │ disappointment       │
╘════╧════════╧══════════════════════╧═════════╧══════════════════════╛
============ Sentiment Score:  1
╒════╤════════╤═════════════════════════╤═════════╤═════════════════════╕
│    │   Most │ Likely                  │   Least │ Likely              │
╞════╪════════╪═════════════════════════╪═════════╪═════════════════════╡
│  0 │  -1.90 │ hard stop               │    2.04 │ animation years     │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  1 │  -1.87 │ unlikable uninteresting │    2.05 │ informative titular │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  2 │  -1.74 │ sensibility             │    2.06 │ does live           │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  3 │  -1.74 │ minded stereotypical    │    2.07 │ pretty mediocre     │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  4 │  -1.67 │ degenerating pious      │    2.09 │ losing touch        │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  5 │  -1.63 │ calls                   │    2.09 │ organic intrigue    │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  6 │  -1.62 │ movie quirky            │    2.19 │ chops looks         │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  7 │  -1.58 │ dull tagline            │    2.21 │ note performance    │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  8 │  -1.58 │ contrived overblown     │    2.34 │ suck suck           │
├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤
│  9 │  -1.56 │ film barely             │    2.57 │ funny entertaining  │
╘════╧════════╧═════════════════════════╧═════════╧═════════════════════╛
============ Sentiment Score:  2
╒════╤════════╤═══════════════╤═════════╤════════════════════╕
│    │   Most │ Likely        │   Least │ Likely             │
╞════╪════════╪═══════════════╪═════════╪════════════════════╡
│  0 │  -2.79 │ delightful    │    1.76 │ sweet smile        │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  1 │  -2.56 │ beautiful     │    1.83 │ hours does         │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  2 │  -2.53 │ wasted        │    1.84 │ smarter smarter    │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  3 │  -2.52 │ beautifully   │    1.85 │ budget movie       │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  4 │  -2.42 │ stunning      │    1.89 │ morally superior   │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  5 │  -2.41 │ terrific      │    1.90 │ summer divine      │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  6 │  -2.31 │ perfect       │    2.03 │ like big           │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  7 │  -2.27 │ unimaginative │    2.06 │ enjoy mindless     │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  8 │  -2.26 │ extraordinary │    2.13 │ inquiries          │
├────┼────────┼───────────────┼─────────┼────────────────────┤
│  9 │  -2.22 │ masterpiece   │    2.22 │ details ultimately │
╘════╧════════╧═══════════════╧═════════╧════════════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════════╤═════════╤════════════════════╕
│    │   Most │ Likely              │   Least │ Likely             │
╞════╪════════╪═════════════════════╪═════════╪════════════════════╡
│  0 │  -2.42 │ devoid              │    1.97 │ enjoyment          │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  1 │  -2.00 │ film offers         │    1.98 │ characters inhabit │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  2 │  -1.96 │ rrb film            │    2.00 │ movie looking      │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  3 │  -1.93 │ zings               │    2.02 │ variety tones      │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  4 │  -1.89 │ unfulfilling        │    2.07 │ stimulating        │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  5 │  -1.89 │ unless              │    2.12 │ like lead          │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  6 │  -1.86 │ heartfelt hilarious │    2.14 │ film ambitious     │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  7 │  -1.86 │ amusing tender      │    2.22 │ positive           │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  8 │  -1.85 │ delightful witty    │    2.47 │ hard resist        │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  9 │  -1.85 │ irresistible blend  │    2.50 │ half bad           │
╘════╧════════╧═════════════════════╧═════════╧════════════════════╛
============ Sentiment Score:  4
╒════╤════════╤══════════════════════════╤═════════╤════════════════╕
│    │   Most │ Likely                   │   Least │ Likely         │
╞════╪════════╪══════════════════════════╪═════════╪════════════════╡
│  0 │  -1.50 │ devastating experience   │    2.07 │ breathtakingly │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  1 │  -1.42 │ sickly                   │    2.11 │ riveted        │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  2 │  -1.38 │ unless                   │    2.13 │ superbly       │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  3 │  -1.27 │ vividly vividly          │    2.16 │ dazzling       │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  4 │  -1.25 │ remarkably unpretentious │    2.17 │ brilliant      │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  5 │  -1.24 │ make worth               │    2.18 │ exquisite      │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  6 │  -1.23 │ uneven                   │    2.21 │ perfection     │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  7 │  -1.21 │ entertainment comes      │    2.21 │ splendid       │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  8 │  -1.19 │ border                   │    2.24 │ beautifully    │
├────┼────────┼──────────────────────────┼─────────┼────────────────┤
│  9 │  -1.17 │ girls swim               │    2.53 │ masterpiece    │
╘════╧════════╧══════════════════════════╧═════════╧════════════════╛

# Experiment: LinearSVC (the shared `svm` instance) on bigram_tv_v4, i.e. the
# bigram_tv_v3 configuration without the min_df argument.
vec = bigram_tv_v4
classifier = svm

# Split, vectorize, fit and score; labels / target names cover classes 0-4.
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# Print the lowest- and highest-weighted features for each class.
return_features(vec, model)

# Add this run's score to the running results and display the table.
# NOTE(review): same 'svm' / 'V1' tags as the bigram_tv_v3 run, so the two rows
# differ only by score - confirm the intended vectorizer label.
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤═════════════════╤═════════╤════════════════╕
│    │   Most │ Likely          │   Least │ Likely         │
╞════╪════════╪═════════════════╪═════════╪════════════════╡
│  0 │  -1.47 │ stupid stupid   │    1.83 │ disaster       │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  1 │  -1.14 │ movie does      │    1.83 │ unmemorable    │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  2 │  -1.13 │ worse worse     │    1.87 │ awful          │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  3 │  -1.06 │ plain boring    │    1.87 │ unappealing    │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  4 │  -1.06 │ fails fails     │    1.89 │ premise just   │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  5 │  -1.03 │ waste waste     │    1.97 │ worst          │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  6 │  -0.98 │ comedy year     │    1.97 │ unwatchable    │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  7 │  -0.96 │ badly badly     │    1.97 │ ugly           │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  8 │  -0.95 │ tedious tedious │    2.12 │ pathetic       │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  9 │  -0.93 │ zero zero       │    2.30 │ disappointment │
╘════╧════════╧═════════════════╧═════════╧════════════════╛
============ Sentiment Score:  1
╒════╤════════╤═════════════════════╤═════════╤════════════════════╕
│    │   Most │ Likely              │   Least │ Likely             │
╞════╪════════╪═════════════════════╪═════════╪════════════════════╡
│  0 │  -1.54 │ pretentious mess    │    1.82 │ funny entertaining │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  1 │  -1.49 │ hard stop           │    1.84 │ fails              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  2 │  -1.41 │ film barely         │    1.88 │ sadly              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  3 │  -1.37 │ lacks substance     │    1.88 │ mushy              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  4 │  -1.35 │ dull tagline        │    1.90 │ foul               │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  5 │  -1.31 │ powerful            │    1.95 │ bland              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  6 │  -1.30 │ like movie          │    2.01 │ ridiculous         │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  7 │  -1.29 │ unnecessary retread │    2.09 │ lacking            │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  8 │  -1.28 │ does rely           │    2.23 │ lack               │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  9 │  -1.28 │ care really         │    2.75 │ lacks              │
╘════╧════════╧═════════════════════╧═════════╧════════════════════╛
============ Sentiment Score:  2
╒════╤════════╤═════════════╤═════════╤═══════════════════════════╕
│    │   Most │ Likely      │   Least │ Likely                    │
╞════╪════════╪═════════════╪═════════╪═══════════════════════════╡
│  0 │  -2.88 │ perfect     │    1.57 │ really really             │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  1 │  -2.81 │ best        │    1.58 │ suspense suspense         │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  2 │  -2.76 │ brilliant   │    1.58 │ dialogue dialogue         │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  3 │  -2.75 │ beautiful   │    1.60 │ good cheesy               │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  4 │  -2.74 │ good        │    1.64 │ surprising surprising     │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  5 │  -2.69 │ beautifully │    1.69 │ rrb lrb                   │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  6 │  -2.65 │ delightful  │    1.72 │ smarter smarter           │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  7 │  -2.63 │ hilarious   │    1.75 │ performances performances │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  8 │  -2.60 │ funny       │    1.87 │ budget movie              │
├────┼────────┼─────────────┼─────────┼───────────────────────────┤
│  9 │  -2.58 │ great       │    1.92 │ enjoy mindless            │
╘════╧════════╧═════════════╧═════════╧═══════════════════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════════╤═════════╤═════════════╕
│    │   Most │ Likely              │   Least │ Likely      │
╞════╪════════╪═════════════════════╪═════════╪═════════════╡
│  0 │  -1.90 │ film offers         │    1.87 │ interesting │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  1 │  -1.85 │ lacking             │    1.88 │ bittersweet │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  2 │  -1.74 │ devoid              │    1.90 │ enjoy       │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  3 │  -1.67 │ lacks               │    1.96 │ enjoyment   │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  4 │  -1.58 │ loses               │    1.99 │ appealing   │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  5 │  -1.56 │ story fascinating   │    2.01 │ hard resist │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  6 │  -1.52 │ delightful witty    │    2.04 │ decent      │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  7 │  -1.49 │ amusing tender      │    2.07 │ positive    │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  8 │  -1.48 │ earnest earnest     │    2.12 │ pleasant    │
├────┼────────┼─────────────────────┼─────────┼─────────────┤
│  9 │  -1.47 │ heartfelt hilarious │    2.30 │ good        │
╘════╧════════╧═════════════════════╧═════════╧═════════════╛
============ Sentiment Score:  4
╒════╤════════╤═══════════════════════════╤═════════╤═══════════════╕
│    │   Most │ Likely                    │   Least │ Likely        │
╞════╪════════╪═══════════════════════════╪═════════╪═══════════════╡
│  0 │  -1.50 │ excellent film            │    2.09 │ magnificent   │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  1 │  -1.48 │ moving moving             │    2.09 │ wonderful     │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  2 │  -1.17 │ best best                 │    2.16 │ extraordinary │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  3 │  -1.16 │ entertainment comes       │    2.18 │ dazzling      │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  4 │  -1.13 │ performances performances │    2.19 │ best          │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  5 │  -1.12 │ devastating experience    │    2.27 │ beautifully   │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  6 │  -1.10 │ stunning stunning         │    2.30 │ stunning      │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  7 │  -1.10 │ fascinating fascinating   │    2.32 │ terrific      │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  8 │  -1.08 │ intelligent intelligent   │    2.41 │ masterpiece   │
├────┼────────┼───────────────────────────┼─────────┼───────────────┤
│  9 │  -1.04 │ bond solid                │    2.48 │ brilliant     │
╘════╧════════╧═══════════════════════════╧═════════╧═══════════════╛

# Experiment cell: run the `svm` classifier on features from `bigram_tv_v5`.
vec = bigram_tv_v5
classifier = svm

# get_model receives the data, the five sentiment labels (as ints and as
# display strings), the classifier and the vectorizer; it hands back the
# model, a score and a report (helper defined elsewhere in the notebook).
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# presumably prints the per-class "Most Likely / Least Likely" feature tables
# that follow this cell in the recorded output -- TODO confirm
return_features(vec, model)

# NOTE(review): the row is logged with 'vectorizer': 'V1' although the
# vectorizer used above is bigram_tv_v5 -- confirm the label is intended,
# otherwise the results table cannot distinguish the runs.
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

============ Sentiment Score:  0
╒════╤════════╤════════════════════════╤═════════╤════════════════╕
│    │   Most │ Likely                 │   Least │ Likely         │
╞════╪════════╪════════════════════════╪═════════╪════════════════╡
│  0 │  -1.30 │ plain boring           │    1.94 │ repugnant      │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  1 │  -1.27 │ stupid stupid          │    1.95 │ unmemorable    │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  2 │  -1.19 │ movie does             │    1.99 │ pathetic       │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  3 │  -1.09 │ badly badly            │    2.03 │ disgusting     │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  4 │  -1.07 │ comedy year            │    2.06 │ worst          │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  5 │  -1.06 │ incompetent incoherent │    2.06 │ premise just   │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  6 │  -1.03 │ fails fails            │    2.07 │ unwatchable    │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  7 │  -1.01 │ movie horrible         │    2.08 │ awful          │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  8 │  -1.00 │ works minutes          │    2.08 │ unappealing    │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  9 │  -0.99 │ worse worse            │    2.60 │ disappointment │
╘════╧════════╧════════════════════════╧═════════╧════════════════╛
============ Sentiment Score:  1
╒════╤════════╤═════════════════════╤═════════╤════════════════════╕
│    │   Most │ Likely              │   Least │ Likely             │
╞════╪════════╪═════════════════════╪═════════╪════════════════════╡
│  0 │  -1.87 │ lacks substance     │    1.91 │ loses              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  1 │  -1.78 │ pretentious mess    │    1.93 │ lack               │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  2 │  -1.78 │ hard stop           │    1.93 │ absurdity          │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  3 │  -1.57 │ contrived overblown │    1.96 │ ridiculous         │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  4 │  -1.52 │ characters team     │    1.96 │ foul               │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  5 │  -1.50 │ powerful            │    1.98 │ special final      │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  6 │  -1.45 │ dull tagline        │    2.07 │ funny entertaining │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  7 │  -1.44 │ does disgrace       │    2.12 │ sadly              │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  8 │  -1.42 │ acted poorly        │    2.27 │ lacking            │
├────┼────────┼─────────────────────┼─────────┼────────────────────┤
│  9 │  -1.42 │ justice awfulness   │    2.77 │ lacks              │
╘════╧════════╧═════════════════════╧═════════╧════════════════════╛
============ Sentiment Score:  2
╒════╤════════╤═════════════╤═════════╤═══════════════════╕
│    │   Most │ Likely      │   Least │ Likely            │
╞════╪════════╪═════════════╪═════════╪═══════════════════╡
│  0 │  -2.78 │ beautiful   │    1.70 │ rrb lrb           │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  1 │  -2.76 │ perfect     │    1.75 │ funny beautifully │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  2 │  -2.71 │ delightful  │    1.77 │ watchable hardly  │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  3 │  -2.67 │ beautifully │    1.79 │ funny laugh       │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  4 │  -2.61 │ brilliant   │    1.81 │ good cheesy       │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  5 │  -2.60 │ best        │    1.85 │ smarter smarter   │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  6 │  -2.58 │ remarkable  │    1.86 │ budget movie      │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  7 │  -2.57 │ terrific    │    1.90 │ morally superior  │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  8 │  -2.55 │ good        │    1.96 │ avoid fatal       │
├────┼────────┼─────────────┼─────────┼───────────────────┤
│  9 │  -2.53 │ great       │    1.96 │ enjoy mindless    │
╘════╧════════╧═════════════╧═════════╧═══════════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════════╤═════════╤═══════════════╕
│    │   Most │ Likely              │   Least │ Likely        │
╞════╪════════╪═════════════════════╪═════════╪═══════════════╡
│  0 │  -2.30 │ devoid              │    1.90 │ charm         │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  1 │  -2.17 │ film offers         │    1.94 │ decent        │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  2 │  -1.93 │ lacking             │    1.94 │ confidence    │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  3 │  -1.86 │ loses               │    1.95 │ method fails  │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  4 │  -1.76 │ lacks               │    1.95 │ movie looking │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  5 │  -1.76 │ amusing tender      │    2.02 │ hard resist   │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  6 │  -1.76 │ heartfelt hilarious │    2.04 │ good          │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  7 │  -1.70 │ reminds animation   │    2.08 │ enjoyment     │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  8 │  -1.63 │ seagal sharp        │    2.12 │ positive      │
├────┼────────┼─────────────────────┼─────────┼───────────────┤
│  9 │  -1.62 │ story fascinating   │    2.27 │ half bad      │
╘════╧════════╧═════════════════════╧═════════╧═══════════════╛
============ Sentiment Score:  4
╒════╤════════╤════════════════════════╤═════════╤═══════════════╕
│    │   Most │ Likely                 │   Least │ Likely        │
╞════╪════════╪════════════════════════╪═════════╪═══════════════╡
│  0 │  -1.56 │ excellent film         │    2.16 │ exquisite     │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  1 │  -1.36 │ moving moving          │    2.16 │ magnificent   │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  2 │  -1.28 │ uneven                 │    2.17 │ beautifully   │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  3 │  -1.27 │ devastating experience │    2.17 │ terrific      │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  4 │  -1.24 │ entertainment comes    │    2.19 │ excellent     │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  5 │  -1.21 │ turns gripping         │    2.30 │ masterfully   │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  6 │  -1.20 │ films thoughtful       │    2.30 │ brilliant     │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  7 │  -1.16 │ great fiery            │    2.30 │ splendid      │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  8 │  -1.12 │ make worth             │    2.34 │ extraordinary │
├────┼────────┼────────────────────────┼─────────┼───────────────┤
│  9 │  -1.11 │ story sweet            │    2.59 │ masterpiece   │
╘════╧════════╧════════════════════════╧═════════╧═══════════════╛

# 	classifier	vectorizer	score
# 0	mnb	V1	0.583317
# 1	mnb	V1	0.594499
# 2	svm	V1	0.629566
# 3	svm	V1	0.636662

TEST 1 -- MNB & SVM with Vectorizer 1¶

# vec = unigram_bool_cv_v1
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
# df

# vec = unigram_bool_cv_v1
# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
# df

NOTES: Very interesting! The MNB results are heavily cluttered with numbers, whereas the SVM results are not.

TEST 2 -- MNB & SVM with Vectorizer 2¶

# vec = unigram_bool_cv_v2
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
# df

# vec = unigram_bool_cv_v2
# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
# df

TEST 3 -- MNB & SVM with Vectorizer 3¶

# vec = unigram_cv
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
# df

# vec = unigram_cv
# classifier = svm


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
# df

TEST 4 -- MNB & SVM with Vectorizer 4¶

# vec = bigram_cv
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
# df

df

[]

TEST 5 -- MNB & SVM with Vectorizer 5¶

# TEST 5: run both classifiers (mnb, then svm) on features from bigram_cv_v2.
vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
# NOTE(review): in the recorded run this line raised
# "NameError: name 'big_df' is not defined", so everything below it did not
# execute -- big_df has to be created before this cell is run.
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})

# Same vectorizer, second classifier.
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
# NOTE(review): the svm result is not logged because the update below is
# commented out -- confirm whether that is intentional.
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})

============ Sentiment Score:  0
╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -11.16 │ aaliyah           │   -6.62 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -11.16 │ abagnale          │   -6.61 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -11.16 │ abagnale antics   │   -6.60 │ minutes    │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -11.16 │ abandon political │   -6.60 │ story      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -11.16 │ abandoned         │   -6.59 │ comedy     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -11.16 │ abbreviated       │   -6.37 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -11.16 │ abel              │   -5.82 │ like       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  7 │ -11.16 │ abel ferrara      │   -5.65 │ bad        │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  8 │ -11.16 │ abhors            │   -5.51 │ film       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  9 │ -11.16 │ abiding           │   -5.00 │ movie      │
╘════╧════════╧═══════════════════╧═════════╧════════════╛
============ Sentiment Score:  1
╒════╤════════╤═══════════════════╤═════════╤══════════╕
│    │   Most │ Likely            │   Least │ Likely   │
╞════╪════════╪═══════════════════╪═════════╪══════════╡
│  0 │ -11.87 │ abagnale          │   -6.27 │ lrb      │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  1 │ -11.87 │ abagnale antics   │   -6.23 │ bad      │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  2 │ -11.87 │ abandon political │   -6.18 │ rrb      │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  3 │ -11.87 │ abbott            │   -6.17 │ little   │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  4 │ -11.87 │ abbott ernest     │   -6.02 │ story    │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  5 │ -11.87 │ abdul             │   -5.97 │ just     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  6 │ -11.87 │ abdul malik       │   -5.96 │ does     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  7 │ -11.87 │ abel              │   -5.57 │ like     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  8 │ -11.87 │ abel ferrara      │   -5.22 │ film     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  9 │ -11.87 │ abilities         │   -5.09 │ movie    │
╘════╧════════╧═══════════════════╧═════════╧══════════╛
============ Sentiment Score:  2
╒════╤════════╤════════════════════════╤═════════╤════════════╕
│    │   Most │ Likely                 │   Least │ Likely     │
╞════╪════════╪════════════════════════╪═════════╪════════════╡
│  0 │ -12.29 │ abandon theater        │   -6.39 │ movies     │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  1 │ -12.29 │ ability shock          │   -6.36 │ characters │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  2 │ -12.29 │ ability think          │   -6.24 │ life       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  3 │ -12.29 │ able performances      │   -6.24 │ time       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  4 │ -12.29 │ able project           │   -6.02 │ lrb        │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  5 │ -12.29 │ abroad                 │   -5.94 │ story      │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  6 │ -12.29 │ absolutely earned      │   -5.77 │ rrb        │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  7 │ -12.29 │ absolutely inescapably │   -5.74 │ like       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  8 │ -12.29 │ absorbing characters   │   -5.19 │ movie      │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  9 │ -12.29 │ absorbing look         │   -5.14 │ film       │
╘════╧════════╧════════════════════════╧═════════╧════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════╤═════════╤══════════╕
│    │   Most │ Likely          │   Least │ Likely   │
╞════╪════════╪═════════════════╪═════════╪══════════╡
│  0 │ -11.99 │ aaliyah         │   -6.26 │ love     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  1 │ -11.99 │ abandon theater │   -6.26 │ lrb      │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  2 │ -11.99 │ abbreviated     │   -6.18 │ life     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  3 │ -11.99 │ abc             │   -6.18 │ rrb      │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  4 │ -11.99 │ abhorrent       │   -6.07 │ like     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  5 │ -11.99 │ abhors          │   -6.01 │ funny    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  6 │ -11.99 │ ability shock   │   -6.01 │ story    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  7 │ -11.99 │ able accomplish │   -5.61 │ good     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  8 │ -11.99 │ able better     │   -5.29 │ movie    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  9 │ -11.99 │ able project    │   -4.99 │ film     │
╘════╧════════╧═════════════════╧═════════╧══════════╛
============ Sentiment Score:  4
╒════╤════════╤═════════════════╤═════════╤══════════════╕
│    │   Most │ Likely          │   Least │ Likely       │
╞════╪════════╪═════════════════╪═════════╪══════════════╡
│  0 │ -11.27 │ aaliyah         │   -6.46 │ performance  │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  1 │ -11.27 │ abagnale        │   -6.41 │ comedy       │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  2 │ -11.27 │ abagnale antics │   -6.36 │ great        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  3 │ -11.27 │ abandon scripts │   -6.34 │ story        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  4 │ -11.27 │ abandon theater │   -6.29 │ performances │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  5 │ -11.27 │ abandoned       │   -6.01 │ good         │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  6 │ -11.27 │ abbott          │   -5.87 │ funny        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  7 │ -11.27 │ abbott ernest   │   -5.78 │ best         │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  8 │ -11.27 │ abbreviated     │   -5.40 │ movie        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  9 │ -11.27 │ abc             │   -4.89 │ film         │
╘════╧════════╧═════════════════╧═════════╧══════════════╛

---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-43-64413d29c6fd> in <module>
      5 model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
      6 return_features(vec, model)
----> 7 df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})
      8 
      9 classifier = svm

NameError: name 'big_df' is not defined

df

TEST 6 -- MNB & SVM with Vectorizer 6¶

# vec = unigram_tv
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})

df

TEST 7 -- MNB & SVM with Vectorizer 7¶

# vec = unigram_tv_v2
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})

df

TEST 8 -- MNB & SVM with Vectorizer 8¶

# vec = bigram_tv
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})

df

TEST 9 -- MNB & SVM with Vectorizer 9¶

# vec = bigram_tv_v2
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})

df

# Load the tab-separated Kaggle training file and pull out the label column
# ('Sentiment') and the text column ('Phrase') as arrays.
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

# Vectorizer used for the submission; do_the_kaggle() reads this global.
# The trailing numbers are presumably the scores obtained with each
# candidate -- TODO confirm.
# pred_vec = bigram_tv_v3 # 60.4
# pred_vec = bigram_tv_v4 # 60.569
pred_vec = bigram_tv_v4 # removing words < 2 60.584


# Load the tab-separated Kaggle test file; k_id / k_text are globals read by
# do_the_kaggle() when it writes the submission.
test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId']
k_text = test['Phrase']

# k_vec = bigram_tv_v3.transform(k_text)
# k_vec

def get_kaggle_test_train_vec(X,y,vectorizer):
    """Fit ``vectorizer`` on all training documents and return the features.

    The earlier version called ``train_test_split(..., test_size=None)``,
    which falls back to scikit-learn's default 25% hold-out, and then threw
    the held-out part away.  The submission model was therefore trained on
    only three quarters of the labelled data (117045 of 156060 rows in the
    recorded run).  No evaluation happens on this path, so every labelled
    row is now used.

    Parameters
    ----------
    X : sequence of str
        Training documents.
    y : sequence
        Labels aligned with ``X``.
    vectorizer : object exposing ``fit_transform``
        Vectorizer to fit; it is fitted in place, so the caller can reuse it
        afterwards to ``transform`` unseen text.

    Returns
    -------
    tuple
        ``(X_train_vec, y_train)`` -- the document-term matrix for all of
        ``X`` and the unchanged labels ``y``.
    """
    X_train_vec = vectorizer.fit_transform(X)
    return X_train_vec, y

def do_the_kaggle(X,y,vec):
    """Train a LinearSVC and write a Kaggle submission CSV.

    Parameters
    ----------
    X : sequence of str
        Training documents.
    y : sequence
        Labels aligned with ``X``.
    vec : vectorizer
        Fitted on the training documents by ``get_kaggle_test_train_vec`` and
        then used to transform the test phrases.

    Side effects
    ------------
    Reads the module-level ``k_text`` and ``k_id`` (test phrases and their
    ids), prints the matrix shapes, and writes
    ``kaggle_submission_linearSVC_v10.csv`` with a ``PhraseId,Sentiment``
    header followed by one line per test phrase.
    """
    X_train_vec, y_train = get_kaggle_test_train_vec(X,y,vec)
    svm_clf = LinearSVC(C=1)
    # Transform with the vectorizer that was just fitted (the ``vec``
    # argument) rather than the global ``pred_vec``; the old code only worked
    # when the caller happened to pass that same global object.
    k_vec = vec.transform(k_text)
    print(len(X), X_train_vec.shape, k_vec.shape)

    prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)
    # The context manager closes the file even if a write fails.
    with open('kaggle_submission_linearSVC_v10.csv', 'w') as outf:
        outf.write('PhraseId,Sentiment\n')
        for phrase_id, sentiment in zip(k_id, prediction):
            outf.write(str(phrase_id) + ',' + str(sentiment) + '\n')
    print('prediction complete')

# Run the submission pipeline on the loaded training data with the selected vectorizer.
do_the_kaggle(X,y,pred_vec)

156060 (117045, 78021) (66292, 78021)
prediction complete

	0	labels	tokens	num_tokens	sentences	num_sentences	no_sw	num_no_sw	topwords_unfil	topwords_fil	...	v_pos_fd	bow	bow_nosw	diy_cleaner	pruned	nltk_negs	unigram_feats	bigram_feats	bigram_feats_neg	pruned_2
0	A series of escapades demonstrating the adage ...	1	['a', 'series', 'of', 'escapades', 'demonstrat...	35	['A series of escapades demonstrating the adag...	1	['series', 'escapades', 'demonstrating', 'adag...	15	[('of', 4), ('the', 3), ('a', 2), ('is', 2), (...	[('good', 2), ('series', 1), ('escapades', 1),...	...	0.307	Counter({'of': 4, 'the': 3, 'a': 2, 'is': 2, '...	Counter({'good': 2, 'series': 1, 'escapades': ...	a series of escapades demonstrating the adage ...	series escapades demonstrating adage that what...	['a', 'series', 'of', 'escapades', 'demonstrat...	['the', 'of', 'is', 'good', 'for', 'of_NEG', '...	['a_series', 'series_of', 'of_escapades', 'esc...	['a_series', 'series_of', 'of_escapades', 'esc...	series escapades demonstrating adage that what...
1	A series of escapades demonstrating the adage ...	2	['a', 'series', 'of', 'escapades', 'demonstrat...	14	['A series of escapades demonstrating the adag...	1	['series', 'escapades', 'demonstrating', 'adag...	6	[('the', 2), ('a', 1), ('series', 1), ('of', 1...	[('series', 1), ('escapades', 1), ('demonstrat...	...	0.367	Counter({'the': 2, 'a': 1, 'series': 1, 'of': ...	Counter({'series': 1, 'escapades': 1, 'demonst...	a series of escapades demonstrating the adage ...	series escapades demonstrating adage that what...	['a', 'series', 'of', 'escapades', 'demonstrat...	['the', 'a', 'series', 'of', 'escapades', 'dem...	['a_series', 'series_of', 'of_escapades', 'esc...	['a_series', 'series_of', 'of_escapades', 'esc...	series escapades demonstrating adage that what...
2	A series	2	['a', 'series']	2	['A series']	1	['series']	1	[('a', 1), ('series', 1)]	[('series', 1)]	...	0.000	Counter({'a': 1, 'series': 1})	Counter({'series': 1})	a series a series a series	series series series	['a', 'series']	['a', 'series']	['a_series']	['a_series']	series series series
3	A	2	['a']	1	['A']	1	[]	0	[('a', 1)]	[]	...	0.000	Counter({'a': 1})	Counter()	a a a	NaN	['a']	['a']	[]	[]	empty
4	series	2	['series']	1	['series']	1	['series']	1	[('series', 1)]	[('series', 1)]	...	0.000	Counter({'series': 1})	Counter({'series': 1})	series series series	series series series	['series']	['series']	[]	[]	series series series

	classifier	vectorizer	score
0	mnb	V1	0.579779
1	mnb	V1	0.593444
2	svm	V1	0.628302
3	svm	V1	0.629617
4	svm	V1	0.630428