HW7: Comparing MNB and SVMs

INTRODUCTION

MNB and SVM

How do we take something with 3000 columns and turn it into something meaninful? In short, we, as humans, can't. But computers can!

ANALYSIS & MODELS

About the Data

In [82]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC


unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
gram12_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def do_the_xy(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)

    y_pred = svm_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    target_names = target_names
    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))

    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec,y_test))
    

With Kaggle Sentiment Data

In [90]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

KAGGLE: Unigram Count Vectorizer

In [91]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  918  1221   697    82    13]
 [  701  4080  5504   514    25]
 [  195  2106 27081  2310   172]
 [   34   396  6048  5533  1057]
 [    3    51   590  1772  1321]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.75     31864
           3       0.54      0.42      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

=====CONFIDENCE SCORES=====
[-1.01718406 -0.50760027  0.22331222 -0.97514719 -1.24718837]
=====SCORES=====
0.6236864026656415

KAGGLE: Unigram TFIDF Vectorizer

In [92]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  795  1387   624   117     8]
 [  589  4336  5245   629    25]
 [  163  2299 26557  2684   161]
 [   24   408  5604  6220   812]
 [    2    40   551  2010  1134]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.51      0.27      0.35      2931
           1       0.51      0.40      0.45     10824
           2       0.69      0.83      0.75     31864
           3       0.53      0.48      0.50     13068
           4       0.53      0.30      0.39      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.61      0.63      0.61     62424

=====CONFIDENCE SCORES=====
[-1.01488761 -0.380324    0.1654111  -0.97048696 -1.23293179]
=====SCORES=====
0.6254325259515571

With Joker Data

In [93]:
neg = get_data_from_files('../NEG_JK_E/')
pos = get_data_from_files('../POS_JK_E/')
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
all_df = neg_df.append(pos_df)
y=all_df['PoN'].values
X=all_df[0].values
=====CONFUSION MATRIX=====
[[11 13]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.48      0.75      0.59        16
           N       0.73      0.46      0.56        24

    accuracy                           0.57        40
   macro avg       0.61      0.60      0.57        40
weighted avg       0.63      0.57      0.57        40

=====CONFIDENCE SCORES=====
0.5249293629884589
=====SCORES=====
0.575

JOKER: Unigram Count Vectorizer`

In [96]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])
=====CONFUSION MATRIX=====
[[11 13]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.48      0.75      0.59        16
           N       0.73      0.46      0.56        24

    accuracy                           0.57        40
   macro avg       0.61      0.60      0.57        40
weighted avg       0.63      0.57      0.57        40

=====CONFIDENCE SCORES=====
0.5249216209994625
=====SCORES=====
0.575

JOKER: Unigram Tfidf Vectorizer

In [94]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])
=====CONFUSION MATRIX=====
[[13 11]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.52      0.75      0.62        16
           N       0.76      0.54      0.63        24

    accuracy                           0.62        40
   macro avg       0.64      0.65      0.62        40
weighted avg       0.67      0.62      0.63        40

=====CONFIDENCE SCORES=====
0.056820071563634
=====SCORES=====
0.625