HW7: Comparing MNB & SVM with Kaggle Sentiment Data

OVERVIEW

This notebook compares Multinomial Naive Bayes (MNB) and a linear SVM (LinearSVC) on the Kaggle sentiment data (movie-review phrases labeled 0-4), under several CountVectorizer and TfidfVectorizer settings.

VECTORIZERS USED:

CountVectorizer
TfidfVectorizer

MODELS USED:

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)


VECTORIZATION PARAMS:

Binary
Stopwords
Unigrams, Bigrams
Min df (max_df is left at its default in all runs)

TODO:

Stemming?
VADER + TextBlob (rough sketch of both below)
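
A rough, hedged sketch of how those TODO items might be wired in later (hypothetical; nothing below feeds the results in this notebook): a Porter-stemmed tokenizer dropped into CountVectorizer, plus the usual entry points for VADER and TextBlob polarity scores.

In [ ]:
## Hypothetical sketch for the TODO items above; not run anywhere in this notebook.
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

stemmer = PorterStemmer()

def stem_tokenize(text):
    ## word-tokenize, then Porter-stem each token
    return [stemmer.stem(tok) for tok in word_tokenize(text)]

## sklearn may warn that the built-in English stop word list is not stemmed
stem_cv = CountVectorizer(encoding='latin-1', tokenizer=stem_tokenize,
                          stop_words='english', min_df=5)

## VADER / TextBlob polarity scores (needs nltk.download('vader_lexicon') and the textblob package):
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# from textblob import TextBlob
# vader_compound = SentimentIntensityAnalyzer().polarity_scores("a gorgeous film")['compound']
# blob_polarity = TextBlob("a gorgeous film").sentiment.polarity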

FUNCTION & PACKAGE PARTY

In [58]:
## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZERS (labeled V1-V9 in the results tables below)

## V1: boolean unigram counts; V2: V1 restricted to alphabetic tokens of 2+ letters
unigram_bool_cv_v1 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_bool_cv_v2 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## V3: unigram term counts, alphabetic tokens only
unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## V4: unigram + bigram counts; V5: V4 restricted to alphabetic tokens
bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
bigram_cv_v2 = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## V6: unigram tf-idf; V7: V6 restricted to alphabetic tokens
unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
unigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## V8: unigram + bigram tf-idf; V9: V8 restricted to alphabetic tokens
bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
bigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    clf = classifier
    clf.fit(X_train_vec,y_train)
    y_pred = clf.predict(X_test_vec)
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
    score = clf.score(X_test_vec,y_test)
    return clf, score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    model, score, report = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
    return model, score, report
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def return_features(vec, model):
    ## model.coef_ holds one row of feature weights per sentiment class:
    ## log P(feature|class) for MNB, linear weights for LinearSVC.
    ## (Newer scikit-learn: use mnb.feature_log_prob_ and vec.get_feature_names_out().)
    for i, feature_weight in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        ranked = sorted(zip(feature_weight, vec.get_feature_names()))
        df1 = pd.DataFrame(ranked[:10])    # 10 lowest-weight (least indicative) features
        df2 = pd.DataFrame(ranked[-10:])   # 10 highest-weight (most indicative) features
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Least","Likely","Most","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    big_df.append(new_row)
    df = pd.DataFrame(big_df)
    df = df.drop_duplicates()
    return df
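
Quick aside before the data loads: a hedged toy illustration (made-up sentence, not from the Kaggle data) of what the token_pattern r'(?u)\b[a-zA-Z]{2,}\b' used in several vectorizers above changes. It keeps only alphabetic tokens of two or more letters, so numeric tokens that the default pattern keeps are dropped.

In [ ]:
## Toy comparison of the default token pattern vs. the alphabetic-only pattern.
demo = ["A series of 102 escapades in 2 acts"]
default_cv = CountVectorizer(encoding='latin-1', binary=True, stop_words='english')
alpha_cv = CountVectorizer(encoding='latin-1', binary=True, stop_words='english',
                           token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
print(default_cv.fit(demo).get_feature_names())   # ['102', 'acts', 'escapades', 'series']
print(alpha_cv.fit(demo).get_feature_names())     # ['acts', 'escapades', 'series']
## (on newer scikit-learn, use get_feature_names_out() instead)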

DATA GOES HERE:

In [36]:
# import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
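
Not part of the original run, but a quick hedged peek at the training data (the Sentiment column holds the 0-4 labels scored below) can make the later accuracy numbers easier to read:

In [ ]:
## Optional: size and class balance of the Kaggle training set.
print(train.shape)
print(train['Sentiment'].value_counts().sort_index())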

TASK 1

TEST 1 -- MNB & SVM with Vectorizer 1

In [37]:
big_df = []
In [38]:
vec = unigram_bool_cv_v1
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df
Out[38]:
classifier vectorizer score
0 mnb V1 0.606401
In [39]:
vec = unigram_bool_cv_v1
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
Out[39]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183

NOTES: Very interesting! With this vectorizer (no alphabetic-only token pattern), MNB's most extreme features are cluttered with number tokens, while SVM's are not.

TEST 2 -- MNB & SVM with Vectorizer 2

In [40]:
vec = unigram_bool_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df
# return_features(vec, model)
Out[40]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
In [41]:
vec = unigram_bool_cv_v2
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df
Out[41]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503

TEST 3 -- MNB & SVM with Vectorizer 3

In [43]:
vec = unigram_cv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df
Out[43]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
In [44]:
vec = unigram_cv
classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df
Out[44]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815

TEST 4 -- MNB & SVM with Vectorizer 4

In [46]:
vec = bigram_cv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

vec = bigram_cv
classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
In [47]:
df
Out[47]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094

TEST 5 -- MNB & SVM with Vectorizer 5

In [64]:
vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})


classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
In [50]:
df
Out[50]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318

TEST 6 -- MNB & SVM with Vectorizer 6

In [ ]:
vec = unigram_tv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})


classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})
In [52]:
df
Out[52]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318
11 mnb V6 0.583606
12 svm V6 0.625433

TEST 7 -- MNB & SVM with Vectorizer 7

In [56]:
vec = unigram_tv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})


classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})
In [57]:
df
Out[57]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318
11 mnb V6 0.583606
12 svm V6 0.625433
13 mnb V7 0.583606
14 svm V7 0.625208

TEST 8 -- MNB & SVM with Vectorizer 8

In [59]:
vec = bigram_tv
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})


classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})
In [60]:
df
Out[60]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318
11 mnb V6 0.583606
12 svm V6 0.625433
13 mnb V7 0.583606
14 svm V7 0.625208
15 mnb V8 0.594899
16 svm V8 0.630126

TEST 9 -- MNB & SVM with Vectorizer 9

In [65]:
vec = bigram_tv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})


classifier = svm


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})
============ Sentiment Score:  0
╒════╤════════╤═══════════════════╤═════════╤══════════╕
│    │  Least │ Likely            │    Most │ Likely   │
╞════╪════════╪═══════════════════╪═════════╪══════════╡
│  0 │ -10.72 │ aaliyah           │   -7.65 │ stupid   │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  1 │ -10.72 │ abagnale          │   -7.62 │ mess     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  2 │ -10.72 │ abagnale antics   │   -7.55 │ minutes  │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  3 │ -10.72 │ abandon political │   -7.49 │ dull     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  4 │ -10.72 │ abandoned         │   -7.38 │ just     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  5 │ -10.72 │ abbreviated       │   -7.19 │ worst    │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  6 │ -10.72 │ abel              │   -7.09 │ like     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  7 │ -10.72 │ abel ferrara      │   -6.84 │ film     │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  8 │ -10.72 │ abhors            │   -6.46 │ bad      │
├────┼────────┼───────────────────┼─────────┼──────────┤
│  9 │ -10.72 │ abiding           │   -6.23 │ movie    │
╘════╧════════╧═══════════════════╧═════════╧══════════╛
============ Sentiment Score:  1
╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │  Least │ Likely            │    Most │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -11.18 │ abagnale          │   -7.03 │ way        │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -11.18 │ abagnale antics   │   -6.97 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -11.18 │ abandon political │   -6.80 │ story      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -11.18 │ abbott            │   -6.74 │ little     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -11.18 │ abbott ernest     │   -6.72 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -11.18 │ abdul             │   -6.70 │ bad        │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -11.18 │ abdul malik       │   -6.69 │ does       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  7 │ -11.18 │ abel              │   -6.56 │ like       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  8 │ -11.18 │ abel ferrara      │   -6.24 │ film       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  9 │ -11.18 │ abilities         │   -6.00 │ movie      │
╘════╧════════╧═══════════════════╧═════════╧════════════╛
============ Sentiment Score:  2
╒════╤════════╤════════════════════════╤═════════╤════════════╕
│    │  Least │ Likely                 │    Most │ Likely     │
╞════╪════════╪════════════════════════╪═════════╪════════════╡
│  0 │ -11.65 │ abandon theater        │   -6.80 │ way        │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  1 │ -11.65 │ ability shock          │   -6.76 │ movies     │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  2 │ -11.65 │ ability think          │   -6.74 │ characters │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  3 │ -11.65 │ able performances      │   -6.73 │ life       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  4 │ -11.65 │ able project           │   -6.63 │ time       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  5 │ -11.65 │ abroad                 │   -6.49 │ story      │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  6 │ -11.65 │ absolutely earned      │   -6.45 │ like       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  7 │ -11.65 │ absolutely inescapably │   -6.44 │ rrb        │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  8 │ -11.65 │ absorbing characters   │   -5.80 │ film       │
├────┼────────┼────────────────────────┼─────────┼────────────┤
│  9 │ -11.65 │ absorbing look         │   -5.78 │ movie      │
╘════╧════════╧════════════════════════╧═════════╧════════════╛
============ Sentiment Score:  3
╒════╤════════╤═════════════════╤═════════╤══════════╕
│    │  Least │ Likely          │    Most │ Likely   │
╞════╪════════╪═════════════════╪═════════╪══════════╡
│  0 │ -11.28 │ aaliyah         │   -6.92 │ like     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  1 │ -11.28 │ abandon theater │   -6.88 │ comedy   │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  2 │ -11.28 │ abbreviated     │   -6.85 │ best     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  3 │ -11.28 │ abc             │   -6.83 │ fun      │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  4 │ -11.28 │ abhorrent       │   -6.80 │ love     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  5 │ -11.28 │ abhors          │   -6.79 │ story    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  6 │ -11.28 │ ability shock   │   -6.32 │ funny    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  7 │ -11.28 │ able accomplish │   -6.11 │ movie    │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  8 │ -11.28 │ able better     │   -6.11 │ good     │
├────┼────────┼─────────────────┼─────────┼──────────┤
│  9 │ -11.28 │ able project    │   -5.92 │ film     │
╘════╧════════╧═════════════════╧═════════╧══════════╛
============ Sentiment Score:  4
╒════╤════════╤═════════════════╤═════════╤══════════════╕
│    │  Least │ Likely          │    Most │ Likely       │
╞════╪════════╪═════════════════╪═════════╪══════════════╡
│  0 │ -10.79 │ aaliyah         │   -7.23 │ fun          │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  1 │ -10.79 │ abagnale        │   -7.07 │ entertaining │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  2 │ -10.79 │ abagnale antics │   -7.02 │ performance  │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  3 │ -10.79 │ abandon scripts │   -6.97 │ great        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  4 │ -10.79 │ abandon theater │   -6.95 │ performances │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  5 │ -10.79 │ abandoned       │   -6.81 │ good         │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  6 │ -10.79 │ abbott          │   -6.65 │ funny        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  7 │ -10.79 │ abbott ernest   │   -6.50 │ movie        │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  8 │ -10.79 │ abbreviated     │   -6.48 │ best         │
├────┼────────┼─────────────────┼─────────┼──────────────┤
│  9 │ -10.79 │ abc             │   -6.06 │ film         │
╘════╧════════╧═════════════════╧═════════╧══════════════╛
============ Sentiment Score:  0
╒════╤════════╤══════════════╤═════════╤══════════════════╕
│    │  Least │ Likely       │    Most │ Likely           │
╞════╪════════╪══════════════╪═════════╪══════════════════╡
│  0 │  -1.32 │ good good    │    2.09 │ awful            │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  1 │  -1.29 │ variation    │    2.11 │ unwatchable      │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  2 │  -1.10 │ just like    │    2.13 │ unbearable       │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  3 │  -1.08 │ going really │    2.14 │ entirely witless │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  4 │  -1.07 │ man garbage  │    2.17 │ distasteful      │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  5 │  -1.07 │ lightness    │    2.27 │ disgusting       │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  6 │  -1.06 │ awful lot    │    2.31 │ garbage          │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  7 │  -1.05 │ appear       │    2.33 │ charm laughs     │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  8 │  -1.04 │ loving       │    2.45 │ waste            │
├────┼────────┼──────────────┼─────────┼──────────────────┤
│  9 │  -1.00 │ movie way    │    2.68 │ disappointment   │
╘════╧════════╧══════════════╧═════════╧══════════════════╛
============ Sentiment Score:  1
╒════╤════════╤═════════════════════════╤═════════╤══════════════════╕
│    │  Least │ Likely                  │    Most │ Likely           │
╞════╪════════╪═════════════════════════╪═════════╪══════════════════╡
│  0 │  -2.21 │ wo feel                 │    2.14 │ delivered mr     │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  1 │  -2.01 │ unlikable uninteresting │    2.17 │ sadly            │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  2 │  -1.76 │ way does                │    2.19 │ want think       │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  3 │  -1.76 │ contrived overblown     │    2.20 │ overbearing      │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  4 │  -1.64 │ justice awfulness       │    2.21 │ padded           │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  5 │  -1.64 │ willing claustrophobic  │    2.25 │ muddy            │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  6 │  -1.60 │ whiny pathetic          │    2.26 │ does live        │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  7 │  -1.58 │ uniquely                │    2.37 │ note performance │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  8 │  -1.55 │ badly hard              │    2.41 │ lacks            │
├────┼────────┼─────────────────────────┼─────────┼──────────────────┤
│  9 │  -1.52 │ fascinating wo          │    2.48 │ squanders        │
╘════╧════════╧═════════════════════════╧═════════╧══════════════════╛
============ Sentiment Score:  2
╒════╤════════╤══════════════╤═════════╤════════════════════════╕
│    │  Least │ Likely       │    Most │ Likely                 │
╞════╪════════╪══════════════╪═════════╪════════════════════════╡
│  0 │  -2.79 │ remarkable   │    1.62 │ redeeming features     │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  1 │  -2.74 │ perfect      │    1.64 │ dramatic constructs    │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  2 │  -2.74 │ beautifully  │    1.64 │ oscar make             │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  3 │  -2.64 │ delightful   │    1.67 │ age film               │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  4 │  -2.58 │ terrific     │    1.71 │ awful lot              │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  5 │  -2.50 │ stunning     │    1.75 │ cunning                │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  6 │  -2.37 │ hilarious    │    1.79 │ willing claustrophobic │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  7 │  -2.37 │ magnificent  │    1.93 │ ludicrous cult         │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  8 │  -2.33 │ worthwhile   │    2.08 │ budget movie           │
├────┼────────┼──────────────┼─────────┼────────────────────────┤
│  9 │  -2.29 │ disappointed │    2.19 │ enjoy mindless         │
╘════╧════════╧══════════════╧═════════╧════════════════════════╛
============ Sentiment Score:  3
╒════╤════════╤════════════════════╤═════════╤═══════════════════╕
│    │  Least │ Likely             │    Most │ Likely            │
╞════╪════════╪════════════════════╪═════════╪═══════════════════╡
│  0 │  -2.12 │ energetic original │    1.97 │ thanks presence   │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  1 │  -2.09 │ lacks              │    1.99 │ best case         │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  2 │  -2.02 │ intelligent life   │    2.01 │ lives count       │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  3 │  -1.98 │ loses              │    2.05 │ heartening        │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  4 │  -1.93 │ wanting mention    │    2.07 │ realistically     │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  5 │  -1.81 │ zings              │    2.10 │ little film       │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  6 │  -1.80 │ mid                │    2.10 │ wo tapping        │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  7 │  -1.76 │ art direction      │    2.14 │ larger life       │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  8 │  -1.73 │ lacking            │    2.18 │ hard resist       │
├────┼────────┼────────────────────┼─────────┼───────────────────┤
│  9 │  -1.70 │ canny crowd        │    2.18 │ far disappointing │
╘════╧════════╧════════════════════╧═════════╧═══════════════════╛
============ Sentiment Score:  4
╒════╤════════╤════════════════╤═════════╤═════════════╕
│    │  Least │ Likely         │    Most │ Likely      │
╞════╪════════╪════════════════╪═════════╪═════════════╡
│  0 │  -1.62 │ thanks actors  │    2.15 │ masterful   │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  1 │  -1.42 │ argue          │    2.15 │ best war    │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  2 │  -1.28 │ real star      │    2.22 │ brilliant   │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  3 │  -1.22 │ naipaul        │    2.24 │ stunning    │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  4 │  -1.17 │ lovely amazing │    2.29 │ masterfully │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  5 │  -1.17 │ convinced      │    2.36 │ magnificent │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  6 │  -1.13 │ huge cut       │    2.40 │ perfection  │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  7 │  -1.12 │ bore           │    2.41 │ zings       │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  8 │  -1.12 │ does succeed   │    2.47 │ amazing     │
├────┼────────┼────────────────┼─────────┼─────────────┤
│  9 │  -1.11 │ say unburdened │    2.50 │ masterpiece │
╘════╧════════╧════════════════╧═════════╧═════════════╛
In [62]:
df
Out[62]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318
11 mnb V6 0.583606
12 svm V6 0.625433
13 mnb V7 0.583606
14 svm V7 0.625208
15 mnb V8 0.594899
16 svm V8 0.630126
17 mnb V9 0.594707
18 svm V9 0.630270
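
get_model also returns the full classification report as a dict (output_dict=True), which never gets printed above. A hedged sketch, not part of the original run, for eyeballing the per-class precision and recall of the most recent model (SVM with V9 from Test 9):

In [ ]:
## Tabulate the per-class / average entries of the last classification report.
## The 'accuracy' entry, if present, is a bare float, so it is skipped here.
report_df = pd.DataFrame({k: v for k, v in report.items() if isinstance(v, dict)}).transpose()
print(report_df)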
In [79]:
## Build the Kaggle submission with the best bigram count vectorizer (V5) and LinearSVC.
test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId'].values
k_text = test['Phrase'].values

def get_kaggle_test_train_vec(X, y, vectorizer):
    ## test_size=None falls back to the default 25% hold-out split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def do_the_kaggle(X, y, vec):
    X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X, y, vec)
    ## transform the Kaggle test phrases with the vectorizer fitted just above,
    ## so the feature space matches the one the classifier is trained on
    k_vec = vec.transform(k_text)
    svm_clf = LinearSVC(C=1)
    prediction = svm_clf.fit(X_train_vec, y_train).predict(k_vec)
    kaggle_submission = zip(k_id, prediction)
    outf = open('kaggle_submission_linearSVC_v5.csv', 'w')
    outf.write('PhraseId,Sentiment\n')
    for x, value in enumerate(kaggle_submission):
        outf.write(str(value[0]) + ',' + str(value[1]) + '\n')
    outf.close()
    print('prediction complete')

do_the_kaggle(X, y, bigram_cv_v2)
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
In [81]:
df
Out[81]:
classifier vectorizer score
0 mnb V1 0.606401
1 svm V1 0.624183
2 mnb V2 0.606978
3 svm V2 0.624503
4 mnb V3 0.606658
6 svm V3 0.623815
7 mnb V4 0.597382
8 svm V4 0.630094
9 mnb V4 0.598151
10 svm V4 0.630318
11 mnb V6 0.583606
12 svm V6 0.625433
13 mnb V7 0.583606
14 svm V7 0.625208
15 mnb V8 0.594899
16 svm V8 0.630126
17 mnb V9 0.594707
18 svm V9 0.630270
19 mnb V5 0.598151
20 svm V5 0.630318
In [ ]: