HW7: Comparing MNB and SVMs

INTRODUCTION

MNB and SVM

How do we take something with 3000 columns and turn it into something meaninful? In short, we, as humans, can't. But computers can!

ANALYSIS & MODELS

About the Data

In [106]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC


unigram_bool_vectorizer = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_count_vectorizer = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
bigram_count_vectorizer = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
unigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
bigram_tfidf_vectorizer = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def do_the_xy(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)

    y_pred = svm_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    target_names = target_names
    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))

    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec,y_test))
    

With Kaggle Sentiment Data

In [107]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values

KAGGLE: Unigram Bool Vectorizer

In [108]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_bool_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  913  1229   696    79    14]
 [  705  4094  5472   527    26]
 [  190  2111 27063  2324   176]
 [   33   394  6011  5568  1062]
 [    3    51   582  1775  1326]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.76     31864
           3       0.54      0.43      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

=====CONFIDENCE SCORES=====
[-1.04825473 -0.50286701  0.20910622 -0.97398091 -1.15145374]
=====SCORES=====
0.6241830065359477

KAGGLE: Unigram Count Vectorizer

In [109]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  918  1221   697    82    13]
 [  701  4080  5504   514    25]
 [  195  2106 27081  2310   172]
 [   34   396  6048  5533  1057]
 [    3    51   590  1772  1321]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.75     31864
           3       0.54      0.42      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

=====CONFIDENCE SCORES=====
[-1.01718399 -0.50760034  0.22331216 -0.97514731 -1.24718845]
=====SCORES=====
0.6236864026656415

KAGGLE: Bigram Count Vectorizer

In [110]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, bigram_count_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[ 1039  1276   542    63    11]
 [  864  4555  4911   457    37]
 [  252  2470 26246  2700   196]
 [   28   358  5383  6034  1265]
 [    5    27   452  1793  1460]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.47      0.35      0.41      2931
           1       0.52      0.42      0.47     10824
           2       0.70      0.82      0.76     31864
           3       0.55      0.46      0.50     13068
           4       0.49      0.39      0.44      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.49      0.51     62424
weighted avg       0.61      0.63      0.62     62424

=====CONFIDENCE SCORES=====
[-1.35329329 -0.56433728  0.50420228 -0.98431383 -1.1448782 ]
=====SCORES=====
0.6301102140202486
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)

KAGGLE: Unigram TFIDF Vectorizer

In [111]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  795  1387   624   117     8]
 [  589  4336  5245   629    25]
 [  163  2299 26557  2684   161]
 [   24   408  5604  6220   812]
 [    2    40   551  2010  1134]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.51      0.27      0.35      2931
           1       0.51      0.40      0.45     10824
           2       0.69      0.83      0.75     31864
           3       0.53      0.48      0.50     13068
           4       0.53      0.30      0.39      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.61      0.63      0.61     62424

=====CONFIDENCE SCORES=====
[-1.01487712 -0.38031632  0.16541388 -0.97047847 -1.23293474]
=====SCORES=====
0.6254325259515571

KAGGLE: Bigram TFIDF Vectorizer

In [112]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, bigram_tfidf_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, [0,1,2,3,4],['0','1','2','3','4'])
=====CONFUSION MATRIX=====
[[  916  1373   565    69     8]
 [  696  4666  4947   493    22]
 [  217  2507 26156  2827   157]
 [   25   364  5343  6334  1002]
 [    5    32   475  1962  1263]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.49      0.31      0.38      2931
           1       0.52      0.43      0.47     10824
           2       0.70      0.82      0.75     31864
           3       0.54      0.48      0.51     13068
           4       0.52      0.34      0.41      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.48      0.51     62424
weighted avg       0.61      0.63      0.62     62424

=====CONFIDENCE SCORES=====
[-1.17972335 -0.4138446   0.29125406 -0.87403192 -1.04112914]
=====SCORES=====
0.6301262334999359

With Joker Data

In [93]:
neg = get_data_from_files('../NEG_JK_E/')
pos = get_data_from_files('../POS_JK_E/')
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
all_df = neg_df.append(pos_df)
y=all_df['PoN'].values
X=all_df[0].values
=====CONFUSION MATRIX=====
[[11 13]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.48      0.75      0.59        16
           N       0.73      0.46      0.56        24

    accuracy                           0.57        40
   macro avg       0.61      0.60      0.57        40
weighted avg       0.63      0.57      0.57        40

=====CONFIDENCE SCORES=====
0.5249293629884589
=====SCORES=====
0.575

JOKER: Unigram Count Vectorizer`

In [96]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_count_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])
=====CONFUSION MATRIX=====
[[11 13]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.48      0.75      0.59        16
           N       0.73      0.46      0.56        24

    accuracy                           0.57        40
   macro avg       0.61      0.60      0.57        40
weighted avg       0.63      0.57      0.57        40

=====CONFIDENCE SCORES=====
0.5249216209994625
=====SCORES=====
0.575

JOKER: Unigram Tfidf Vectorizer

In [94]:
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y, unigram_tfidf_vectorizer)
do_the_xy(X_train_vec, X_test_vec, y_train, y_test, ['P','N'],['P','N'])
=====CONFUSION MATRIX=====
[[13 11]
 [ 4 12]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           P       0.52      0.75      0.62        16
           N       0.76      0.54      0.63        24

    accuracy                           0.62        40
   macro avg       0.64      0.65      0.62        40
weighted avg       0.67      0.62      0.63        40

=====CONFIDENCE SCORES=====
0.056820071563634
=====SCORES=====
0.625