# HW7: Comparing MNB & SVM with Kaggle Sentiment Data

## OVERVIEW

---
### VECTORIZERS USED:
    CountVectorizer
    TfidfVectorizer

### MODELS USED:
    Multinomial Naive Bayes (MNB)
    Support Vector Machines (SVM)
---

---
#### VECTORIZATION PARAMS:
    Binary
    Stopwords
    Unigrams, Bigrams
    Min & Max df
---

#### TODO:
    Stemming?
    Vadar + TextBlob

### FUNCTION & PACKAGE PARTY

In [120]:
## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## PREPROCESSING
## =======================================================

# FIRST - removing anything with 3 or fewer characters
# def my_preprocessor(doc):
# #     print('PREPROCESSING!!!!!')
#     if len(doc) > 3:
#         return(doc)
#     else:
#         return('none')
    
def my_preprocessor(doc):
#     print('PREPROCESSING!!!!!')
    if len(doc) > 2:
        return(doc)
    else:
        return('empty')

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZORS
unigram_bool_cv_v1 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_bool_cv_v2 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english', 
                                     token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )

unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', 
                             token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )

bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
bigram_cv_v2 = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
unigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english', 
                                token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
bigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## ----- VECTORIZERS with PREPROCESSING

unigram_tv_v3 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english', 
                                preprocessor=my_preprocessor, token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
bigram_tv_v3 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5,  
                               preprocessor=my_preprocessor, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

bigram_tv_v4 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2),  
                               preprocessor=my_preprocessor, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

bigram_tv_v5 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=3,
                               preprocessor=my_preprocessor, stop_words='english', 
                               token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    clf = classifier
    clf.fit(X_train_vec,y_train)
    y_pred = clf.predict(X_test_vec)
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
    score = clf.score(X_test_vec,y_test)
    return clf, score, report
    
def get_model(X, y, labels, target_names, classifier, vec):
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    model, score, report = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
    return model, score, report
    
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd

def return_features(vec, model):
    for i,feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))

def update_big_df(big_df, new_row):
    big_df.append(new_row)
    df = pd.DataFrame(big_df)
    df = df.drop_duplicates()
    return df





### DATA GOES HERE:

In [122]:
# import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
big_df = []



In [144]:
train=pd.read_csv("../HW2/hw7_data_sentiment.csv")
train.head()
# y=train['labels'].values
# X=train['pruned'].values

def remove_na(string):
#     print(type(string))
    if type(string) == str:
        return string
    else:
        return "empty"
train['pruned_2'] = train.apply(lambda x: remove_na(x['pruned']), axis= 1)

y=train['labels'].values
X=train['pruned_2'].values


In [153]:
big_df = []
train.head()

Unnamed: 0,0,labels,tokens,num_tokens,sentences,num_sentences,no_sw,num_no_sw,topwords_unfil,topwords_fil,...,v_pos_fd,bow,bow_nosw,diy_cleaner,pruned,nltk_negs,unigram_feats,bigram_feats,bigram_feats_neg,pruned_2
0,A series of escapades demonstrating the adage ...,1,"['a', 'series', 'of', 'escapades', 'demonstrat...",35,['A series of escapades demonstrating the adag...,1,"['series', 'escapades', 'demonstrating', 'adag...",15,"[('of', 4), ('the', 3), ('a', 2), ('is', 2), (...","[('good', 2), ('series', 1), ('escapades', 1),...",...,0.307,"Counter({'of': 4, 'the': 3, 'a': 2, 'is': 2, '...","Counter({'good': 2, 'series': 1, 'escapades': ...",a series of escapades demonstrating the adage ...,series escapades demonstrating adage that what...,"['a', 'series', 'of', 'escapades', 'demonstrat...","['the', 'of', 'is', 'good', 'for', 'of_NEG', '...","['a_series', 'series_of', 'of_escapades', 'esc...","['a_series', 'series_of', 'of_escapades', 'esc...",series escapades demonstrating adage that what...
1,A series of escapades demonstrating the adage ...,2,"['a', 'series', 'of', 'escapades', 'demonstrat...",14,['A series of escapades demonstrating the adag...,1,"['series', 'escapades', 'demonstrating', 'adag...",6,"[('the', 2), ('a', 1), ('series', 1), ('of', 1...","[('series', 1), ('escapades', 1), ('demonstrat...",...,0.367,"Counter({'the': 2, 'a': 1, 'series': 1, 'of': ...","Counter({'series': 1, 'escapades': 1, 'demonst...",a series of escapades demonstrating the adage ...,series escapades demonstrating adage that what...,"['a', 'series', 'of', 'escapades', 'demonstrat...","['the', 'a', 'series', 'of', 'escapades', 'dem...","['a_series', 'series_of', 'of_escapades', 'esc...","['a_series', 'series_of', 'of_escapades', 'esc...",series escapades demonstrating adage that what...
2,A series,2,"['a', 'series']",2,['A series'],1,['series'],1,"[('a', 1), ('series', 1)]","[('series', 1)]",...,0.0,"Counter({'a': 1, 'series': 1})",Counter({'series': 1}),a series a series a series,series series series,"['a', 'series']","['a', 'series']",['a_series'],['a_series'],series series series
3,A,2,['a'],1,['A'],1,[],0,"[('a', 1)]",[],...,0.0,Counter({'a': 1}),Counter(),a a a,,['a'],['a'],[],[],empty
4,series,2,['series'],1,['series'],1,['series'],1,"[('series', 1)]","[('series', 1)]",...,0.0,Counter({'series': 1}),Counter({'series': 1}),series series series,series series series,['series'],['series'],[],[],series series series


# TASK 1

## With Preprocessing

In [147]:
vec = unigram_tv_v3
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤═══════════╤═════════╤════════════╕
│    │   Most │ Likely    │   Least │ Likely     │
╞════╪════════╪═══════════╪═════════╪════════════╡
│  0 │  -9.92 │ aaliyah   │   -6.55 │ time       │
├────┼────────┼───────────┼─────────┼────────────┤
│  1 │  -9.92 │ abandoned │   -6.55 │ characters │
├────┼────────┼───────────┼─────────┼────────────┤
│  2 │  -9.92 │ abbott    │   -6.47 │ comedy     │
├────┼────────┼───────────┼─────────┼────────────┤
│  3 │  -9.92 │ abdul     │   -6.45 │ dull       │
├────┼────────┼───────────┼─────────┼────────────┤
│  4 │  -9.92 │ abel      │   -6.37 │ minutes    │
├────┼────────┼───────────┼─────────┼────────────┤
│  5 │  -9.92 │ ably      │   -6.12 │ worst      │
├────┼────────┼───────────┼─────────┼────────────┤
│  6 │  -9.92 │ aborted   │   -6.09 │ just       │
├────┼────────┼───────────┼─────────┼────────────┤
│  7 │  -9.92 │ abound    │   -5.92 │ like       │
├────┼────────┼───────────┼─────────┼────────────┤
│  8 │  -9.92 │ abrahams  │   -

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.579779


In [148]:
vec = bigram_tv_v3
classifier = mnb

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -10.71 │ aaliyah           │   -7.63 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -10.71 │ abandon political │   -7.63 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -10.71 │ abandoned         │   -7.53 │ comedy     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -10.71 │ abbott            │   -7.48 │ minutes    │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -10.71 │ abdul             │   -7.47 │ dull       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -10.71 │ abdul malik       │   -7.27 │ worst      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -10.71 │ abel              │   -7.12 │ just       │
├────┼────────┼───────────────────┼─────────┼───────────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.579779
1,mnb,V1,0.593444


In [149]:
vec = bigram_tv_v3
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤══════════════════════╤═════════╤══════════════════════╕
│    │   Most │ Likely               │   Least │ Likely               │
╞════╪════════╪══════════════════════╪═════════╪══════════════════════╡
│  0 │  -1.32 │ hawke                │    2.00 │ entirely witless     │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  1 │  -1.22 │ loving               │    2.01 │ hideously            │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  2 │  -1.21 │ works minutes        │    2.02 │ time stinker         │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  3 │  -1.13 │ flick film           │    2.06 │ disgusting           │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  4 │  -1.13 │ movie does           │    2.08 │ minded stereotypical │
├────┼────────┼──────────────────────┼─────────┼──────────────────────┤
│  5 │  -1.12 │ mind right           │    2.12 │ unappealing    

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.579779
1,mnb,V1,0.593444
2,svm,V1,0.628302


In [150]:
vec = bigram_tv_v4
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤═════════════════╤═════════╤════════════════╕
│    │   Most │ Likely          │   Least │ Likely         │
╞════╪════════╪═════════════════╪═════════╪════════════════╡
│  0 │  -1.47 │ stupid stupid   │    1.83 │ disaster       │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  1 │  -1.14 │ movie does      │    1.83 │ unmemorable    │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  2 │  -1.13 │ worse worse     │    1.87 │ awful          │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  3 │  -1.06 │ plain boring    │    1.87 │ unappealing    │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  4 │  -1.06 │ fails fails     │    1.89 │ premise just   │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  5 │  -1.03 │ waste waste     │    1.97 │ worst          │
├────┼────────┼─────────────────┼─────────┼────────────────┤
│  6 │  -0.98 │ comedy year     │    1.97 │ unwatchable    │
├────┼────────┼─────────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.579779
1,mnb,V1,0.593444
2,svm,V1,0.628302
3,svm,V1,0.629617


In [151]:
vec = bigram_tv_v5
classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)

df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df

╒════╤════════╤════════════════════════╤═════════╤════════════════╕
│    │   Most │ Likely                 │   Least │ Likely         │
╞════╪════════╪════════════════════════╪═════════╪════════════════╡
│  0 │  -1.30 │ plain boring           │    1.94 │ repugnant      │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  1 │  -1.27 │ stupid stupid          │    1.95 │ unmemorable    │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  2 │  -1.19 │ movie does             │    1.99 │ pathetic       │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  3 │  -1.09 │ badly badly            │    2.03 │ disgusting     │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  4 │  -1.07 │ comedy year            │    2.06 │ worst          │
├────┼────────┼────────────────────────┼─────────┼────────────────┤
│  5 │  -1.06 │ incompetent incoherent │    2.06 │ premise just   │
├────┼────────┼────────────────────────┼────────

Unnamed: 0,classifier,vectorizer,score
0,mnb,V1,0.579779
1,mnb,V1,0.593444
2,svm,V1,0.628302
3,svm,V1,0.629617
4,svm,V1,0.630428


In [152]:
# 	classifier	vectorizer	score
# 0	mnb	V1	0.583317
# 1	mnb	V1	0.594499
# 2	svm	V1	0.629566
# 3	svm	V1	0.636662

## TEST 1 -- MNB & SVM with Vectorizer 1


In [33]:
# vec = unigram_bool_cv_v1
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
# df

In [34]:
# vec = unigram_bool_cv_v1
# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
# df

NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not. 

## TEST 2 -- MNB & SVM with Vectorizer 2



In [35]:
# vec = unigram_bool_cv_v2
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
# df

In [36]:
# vec = unigram_bool_cv_v2
# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
# df

## TEST 3 -- MNB & SVM with Vectorizer 3


In [37]:
# vec = unigram_cv
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
# df

In [38]:
# vec = unigram_cv
# classifier = svm


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)

# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
# df

## TEST 4 -- MNB & SVM with Vectorizer 4


In [39]:
# vec = bigram_cv
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
# df

In [40]:
df

[]

## TEST 5 -- MNB & SVM with Vectorizer 5


In [43]:
vec = bigram_cv_v2
classifier = mnb


model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})

classifier = svm

model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})

╒════╤════════╤═══════════════════╤═════════╤════════════╕
│    │   Most │ Likely            │   Least │ Likely     │
╞════╪════════╪═══════════════════╪═════════╪════════════╡
│  0 │ -11.16 │ aaliyah           │   -6.62 │ time       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  1 │ -11.16 │ abagnale          │   -6.61 │ characters │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  2 │ -11.16 │ abagnale antics   │   -6.60 │ minutes    │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  3 │ -11.16 │ abandon political │   -6.60 │ story      │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  4 │ -11.16 │ abandoned         │   -6.59 │ comedy     │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  5 │ -11.16 │ abbreviated       │   -6.37 │ just       │
├────┼────────┼───────────────────┼─────────┼────────────┤
│  6 │ -11.16 │ abel              │   -5.82 │ like       │
├────┼────────┼───────────────────┼─────────┼───────────

NameError: name 'big_df' is not defined

In [None]:
df

## TEST 6 -- MNB & SVM with Vectorizer 6

In [None]:
# vec = unigram_tv
# classifier = mnb


# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})

In [None]:
df

## TEST 7 -- MNB & SVM with Vectorizer 7

In [None]:
# vec = unigram_tv_v2
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})

In [None]:
df

## TEST 8 -- MNB & SVM with Vectorizer 8

In [None]:
# vec = bigram_tv
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})

In [None]:
df

## TEST 9 -- MNB & SVM with Vectorizer 9

In [None]:
# vec = bigram_tv_v2
# classifier = mnb

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})

# classifier = svm

# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})

In [None]:
df

In [101]:
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

# pred_vec = bigram_tv_v3 # 60.4
# pred_vec = bigram_tv_v4 # 60.569
pred_vec = bigram_tv_v4 # removing words < 2 60.584


test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId']
k_text = test['Phrase']

# k_vec = bigram_tv_v3.transform(k_text)
# k_vec

def get_kaggle_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
#     X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, y_train,

def do_the_kaggle(X,y,vec):
    X_train_vec, y_train = get_kaggle_test_train_vec(X,y,vec)
    svm_clf = LinearSVC(C=1)
    k_vec = pred_vec.transform(k_text)
    print(len(X), X_train_vec.shape, k_vec.shape)

    prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)
    kaggle_submission = zip(k_id, prediction)
    outf=open('kaggle_submission_linearSVC_v10.csv', 'w')
    outf.write('PhraseId,Sentiment\n')
    for x, value in enumerate(kaggle_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\n')
    outf.close()
    print('prediction complete')

do_the_kaggle(X,y,pred_vec)

156060 (117045, 78021) (66292, 78021)
prediction complete
