HW7: Comparing MNB & SVM with Kaggle Sentiment Data

OVERVIEW


VECTORIZERS USED:

CountVectorizer
TfidfVectorizer

MODELS USED:

Multinomial Naive Bayes (MNB)
Support Vector Machines (SVM)


VECTORIZATION PARAMS:

Binary
Stopwords
Unigrams, Bigrams
Min & Max df

TODO:

Stemming?
Vadar + TextBlob

FUNCTION & PACKAGE PARTY

In [113]:
## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize

## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

## ----- VECTORIZORS
unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    clf = classifier
    clf.fit(X_train_vec,y_train)
    print(clf.score(X_test_vec,y_test))
    return clf
    
def get_model(X, y, labels, target_names, classifier, vec):
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    model = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
    return model
    
## =======================================================
## VISUALIZING
## =======================================================

def return_features(vec, model):
    for i,feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        print('++++++++++++Most likely')        
        print(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        print('------------Least likely')
        print(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
        

DATA GOES HERE:

In [114]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

TASK 1

In [115]:
vec = unigram_bool_cv
classifier = mnb

model = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
0.606401384083045
============ Sentiment Score:  0
++++++++++++Most likely
[(-10.479004810717253, '102'), (-10.479004810717253, '10th'), (-10.479004810717253, '127'), (-10.479004810717253, '13th'), (-10.479004810717253, '14'), (-10.479004810717253, '16'), (-10.479004810717253, '163'), (-10.479004810717253, '168'), (-10.479004810717253, '170'), (-10.479004810717253, '1790')]
------------Least likely
[(-10.479004810717253, '102'), (-10.479004810717253, '10th'), (-10.479004810717253, '127'), (-10.479004810717253, '13th'), (-10.479004810717253, '14'), (-10.479004810717253, '16'), (-10.479004810717253, '163'), (-10.479004810717253, '168'), (-10.479004810717253, '170'), (-10.479004810717253, '1790')]
============ Sentiment Score:  1
++++++++++++Most likely
[(-11.330887867855907, '000'), (-11.330887867855907, '10th'), (-11.330887867855907, '127'), (-11.330887867855907, '14'), (-11.330887867855907, '168'), (-11.330887867855907, '1790'), (-11.330887867855907, '1915'), (-11.330887867855907, '1920'), (-11.330887867855907, '1933'), (-11.330887867855907, '1937')]
------------Least likely
[(-11.330887867855907, '000'), (-11.330887867855907, '10th'), (-11.330887867855907, '127'), (-11.330887867855907, '14'), (-11.330887867855907, '168'), (-11.330887867855907, '1790'), (-11.330887867855907, '1915'), (-11.330887867855907, '1920'), (-11.330887867855907, '1933'), (-11.330887867855907, '1937')]
============ Sentiment Score:  2
++++++++++++Most likely
[(-11.837216670289285, 'abroad'), (-11.837216670289285, 'acclaim'), (-11.837216670289285, 'acumen'), (-11.837216670289285, 'adding'), (-11.837216670289285, 'admirers'), (-11.837216670289285, 'affirms'), (-11.837216670289285, 'aggravating'), (-11.837216670289285, 'aimlessly'), (-11.837216670289285, 'amaze'), (-11.837216670289285, 'ambiguities')]
------------Least likely
[(-11.837216670289285, 'abroad'), (-11.837216670289285, 'acclaim'), (-11.837216670289285, 'acumen'), (-11.837216670289285, 'adding'), (-11.837216670289285, 'admirers'), (-11.837216670289285, 'affirms'), (-11.837216670289285, 'aggravating'), (-11.837216670289285, 'aimlessly'), (-11.837216670289285, 'amaze'), (-11.837216670289285, 'ambiguities')]
============ Sentiment Score:  3
++++++++++++Most likely
[(-11.473165406396351, '102'), (-11.473165406396351, '104'), (-11.473165406396351, '105'), (-11.473165406396351, '110'), (-11.473165406396351, '120'), (-11.473165406396351, '127'), (-11.473165406396351, '140'), (-11.473165406396351, '146'), (-11.473165406396351, '1915'), (-11.473165406396351, '1959')]
------------Least likely
[(-11.473165406396351, '102'), (-11.473165406396351, '104'), (-11.473165406396351, '105'), (-11.473165406396351, '110'), (-11.473165406396351, '120'), (-11.473165406396351, '127'), (-11.473165406396351, '140'), (-11.473165406396351, '146'), (-11.473165406396351, '1915'), (-11.473165406396351, '1959')]
============ Sentiment Score:  4
++++++++++++Most likely
[(-10.625732263919769, '000'), (-10.625732263919769, '101'), (-10.625732263919769, '102'), (-10.625732263919769, '103'), (-10.625732263919769, '104'), (-10.625732263919769, '105'), (-10.625732263919769, '10th'), (-10.625732263919769, '110'), (-10.625732263919769, '112'), (-10.625732263919769, '12')]
------------Least likely
[(-10.625732263919769, '000'), (-10.625732263919769, '101'), (-10.625732263919769, '102'), (-10.625732263919769, '103'), (-10.625732263919769, '104'), (-10.625732263919769, '105'), (-10.625732263919769, '10th'), (-10.625732263919769, '110'), (-10.625732263919769, '112'), (-10.625732263919769, '12')]
In [ ]: