## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
# Settings shared by every vectorizer: latin-1 decoding, drop terms appearing in
# fewer than 5 documents, remove English stop words.
_SHARED = dict(encoding='latin-1', min_df=5, stop_words='english')
# Token pattern restricting features to purely alphabetic words of length >= 2.
_ALPHA_ONLY = r'(?u)\b[a-zA-Z]{2,}\b'
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_SHARED)
unigram_bool_cv_v2 = CountVectorizer(binary=True, token_pattern=_ALPHA_ONLY, **_SHARED)
unigram_cv = CountVectorizer(binary=False, token_pattern=_ALPHA_ONLY, **_SHARED)
bigram_cv = CountVectorizer(ngram_range=(1, 2), **_SHARED)
bigram_cv_v2 = CountVectorizer(ngram_range=(1, 2), token_pattern=_ALPHA_ONLY, **_SHARED)
unigram_tv = TfidfVectorizer(use_idf=True, **_SHARED)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, token_pattern=_ALPHA_ONLY, **_SHARED)
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), **_SHARED)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), token_pattern=_ALPHA_ONLY, **_SHARED)
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
# The two classifiers compared in every experiment below.
mnb = MultinomialNB()  # multinomial Naive Bayes with default hyperparameters
svm = LinearSVC(C=1)   # linear SVM; C=1 is spelled out explicitly
def get_test_train_vec(X, y, vectorizer):
    """Split X/y into 60% train / 40% test (fixed seed) and vectorize both.

    The vocabulary is fitted on the training text only; the held-out text is
    transformed with that fitted vocabulary.
    """
    split = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train, X_test, y_train, y_test = split
    train_vec = vectorizer.fit_transform(X_train)
    test_vec = vectorizer.transform(X_test)
    return train_vec, test_vec, y_train, y_test
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit `classifier` on the training vectors and evaluate on the test vectors.

    Returns the fitted classifier, its accuracy on the test set, and a
    per-class classification report as a dict. `labels` is accepted for
    interface compatibility but is not used here.
    """
    clf = classifier
    clf.fit(X_train_vec, y_train)
    predictions = clf.predict(X_test_vec)
    report = classification_report(
        y_test, predictions, target_names=target_names, output_dict=True)
    accuracy = clf.score(X_test_vec, y_test)
    return clf, accuracy, report
def get_model(X, y, labels, target_names, classifier, vec):
    """Convenience wrapper: split + vectorize, then train and score `classifier`.

    Returns (fitted model, accuracy score, per-class report dict).
    """
    vectors = get_test_train_vec(X, y, vec)
    return run_classifier(*vectors, labels, target_names, classifier)
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd
def return_features(vec, model):
    """Print, for each class of a fitted linear model, its 10 lowest- and
    10 highest-weighted features as a side-by-side table.

    `vec` must be the (fitted) vectorizer whose vocabulary produced the
    columns of `model.coef_`.
    """
    # NOTE(review): get_feature_names() was removed in scikit-learn >= 1.0
    # (replaced by get_feature_names_out()) — confirm the pinned version.
    feature_names = vec.get_feature_names()
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        # Sort the (weight, feature) pairs once per class instead of twice.
        ranked = sorted(zip(feature_probability, feature_names))
        df1 = pd.DataFrame(ranked[:10])    # 10 smallest weights
        df2 = pd.DataFrame(ranked[-10:])   # 10 largest weights
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))
def update_big_df(big_df, new_row):
    """Append `new_row` (a dict) to the accumulator list `big_df` in place,
    then return the accumulated rows as a de-duplicated DataFrame.
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()
# import pandas as pd
# Load the Kaggle sentiment training data (tab-separated; uses the
# 'Sentiment' and 'Phrase' columns).
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values  # sentiment labels 0-4
X=train['Phrase'].values     # raw phrase text
# Accumulator: one {classifier, vectorizer, score} row per experiment below.
big_df = []
## --- V1: boolean unigram counts (unigram_bool_cv_v1), MNB then SVM ---
vec = unigram_bool_cv_v1
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df  # bare expression: displays the frame in a notebook; no effect as a script
vec = unigram_bool_cv_v1
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
# NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.
## --- V2: boolean unigrams, alphabetic-only tokens (unigram_bool_cv_v2) ---
vec = unigram_bool_cv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df
# return_features(vec, model)
vec = unigram_bool_cv_v2
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df
## --- V3: unigram counts, alphabetic-only tokens (unigram_cv) ---
vec = unigram_cv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df
vec = unigram_cv
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df
## --- V4: unigram+bigram counts (bigram_cv) ---
vec = bigram_cv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})
vec = bigram_cv
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
df
## --- V5: unigram+bigram counts, alphabetic-only tokens (bigram_cv_v2) ---
vec = bigram_cv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})
df
## --- V6: unigram tf-idf (unigram_tv) ---
vec = unigram_tv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})
df
## --- V7: unigram tf-idf, alphabetic-only tokens (unigram_tv_v2) ---
vec = unigram_tv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})
df
## --- V8: unigram+bigram tf-idf (bigram_tv) ---
vec = bigram_tv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})
df
## --- V9: unigram+bigram tf-idf, alphabetic-only tokens (bigram_tv_v2) ---
## The only experiment that also prints the top/bottom features per class.
vec = bigram_tv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})
df
# NOTE(review): pred_vec is assigned but never read below.
pred_vec = bigram_cv_v2
# Load the Kaggle test set to be classified for submission.
test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId'].values    # submission row ids
k_text = test['Phrase'].values    # phrases to classify
# NOTE(review): this transform uses whatever fit bigram_cv_v2 last received
# (the 60/40 split of the V5 experiment above). do_the_kaggle() later re-fits
# the same vectorizer on a different split, so k_vec's feature space may not
# match that refit — verify before predicting with it.
k_vec = bigram_cv_v2.transform(k_text)
k_vec
def get_kaggle_test_train_vec(X, y, vectorizer):
    """Split X/y with the library's default test fraction and vectorize both.

    Identical to get_test_train_vec except test_size=None, which falls back
    to scikit-learn's default split fraction (version-dependent — confirm).
    The vocabulary is fitted on the training portion only.
    """
    split = train_test_split(X, y, test_size=None, random_state=0)
    X_train, X_test, y_train, y_test = split
    return (vectorizer.fit_transform(X_train),
            vectorizer.transform(X_test),
            y_train,
            y_test)
def do_the_kaggle(X, y, vec):
    """Fit a LinearSVC on a fresh train split, predict the Kaggle test set,
    and write 'kaggle_submission_linearSVC_v5.csv' (PhraseId,Sentiment rows).

    Reads the module-level globals `k_id` (test PhraseIds) and `k_text`
    (test phrases) loaded earlier in this file.
    """
    X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X, y, vec)
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)
    # BUG FIX: transform the Kaggle text with the vectorizer as *just* fitted
    # above. The module-level k_vec was built from an earlier fit on a
    # different split, so its feature space need not match this classifier.
    prediction = svm_clf.predict(vec.transform(k_text))
    # Context manager guarantees the file is closed even if a write fails.
    with open('kaggle_submission_linearSVC_v5.csv', 'w') as outf:
        outf.write('PhraseId,Sentiment\n')
        for phrase_id, label in zip(k_id, prediction):
            outf.write(str(phrase_id) + ',' + str(label) + '\n')
    print('prediction complete')
# Build and write the Kaggle submission using the V5 vectorizer.
do_the_kaggle(X,y,bigram_cv_v2)
df  # final results table (visible only in a notebook)