In [ ]:
 
In [3]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every entry of a directory and return the contents as strings.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        list[str]: One string per directory entry, ordered by entry name.

    Raises:
        OSError: If ``path`` cannot be listed, or an entry cannot be opened
            as a file (for example, a sub-directory).
    """
    results = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # returned list reproducible between runs and machines.
    for file in sorted(os.listdir(path)):
        # os.path.join adds the separator only when `path` lacks one, so both
        # "dir" and "dir/" work (plain concatenation needed the trailing slash).
        # The context manager closes the handle even if read() raises.
        with open(os.path.join(path, file)) as f:
            results.append(f.read())
    return results

## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer


# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

# Keyword arguments common to every vectorizer in the comparison below.
_shared_vectorizer_opts = dict(encoding='latin-1', min_df=5, stop_words='english')

# Feature extractors compared by do_the_thing(); the list position is the
# index printed in each "Vectorizer Scores for <index>_<class>" banner.
vectorizers = [
    # 0: unigram presence/absence (binary counts)
    CountVectorizer(binary=True, **_shared_vectorizer_opts),
    # 1: unigram term counts
    CountVectorizer(binary=False, **_shared_vectorizer_opts),
    # 2: unigram + bigram term counts
    CountVectorizer(ngram_range=(1, 2), **_shared_vectorizer_opts),
    # 3: unigram TF-IDF
    TfidfVectorizer(use_idf=True, **_shared_vectorizer_opts),
    # 4: unigram TF-IDF with an additional max_df=0.50 cut-off
    TfidfVectorizer(use_idf=True, max_df=0.50, **_shared_vectorizer_opts),
    # 5: unigram + bigram TF-IDF
    TfidfVectorizer(use_idf=True, ngram_range=(1, 2), **_shared_vectorizer_opts),
]

def get_test_train_vec(X, y, vectorizer, test_size=0.4, random_state=0):
    """Split the data, then vectorize the train and test portions.

    Args:
        X: Sequence of raw text documents.
        y: Sequence of labels aligned with ``X``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``
            (e.g. one of the entries in ``vectorizers``). It is fitted in
            place on the training portion.
        test_size: Forwarded to ``train_test_split``; defaults to the 0.4
            hold-out this notebook has always used.
        random_state: Forwarded to ``train_test_split``; the default of 0
            keeps the split identical to earlier runs.

    Returns:
        tuple: ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Fit on the training portion only; the test portion is merely
    # transformed, so nothing from the held-out documents feeds the fit.
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Fit a MultinomialNB classifier and print its evaluation on the test set.

    Prints, in order: a header, the confusion matrix, the classification
    report and the value of ``score`` on the test data. Returns nothing.

    Args:
        X_train_vec: Vectorized training documents.
        X_test_vec: Vectorized test documents.
        y_train: Training labels.
        y_test: Test labels.
        labels: Class labels, in the order used for the confusion-matrix
            rows/columns and the report rows.
        target_names: Display names for the report, aligned with ``labels``.
    """
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    print('*****MNB*****')
    y_pred = mnb_clf.predict(X_test_vec)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('*****CONFUSION MATRIX*****')
    print(cm)
    print('*****CLASSIFICATION REPORT*****')
    # Pass `labels` explicitly so the report rows follow the same class order
    # as the confusion matrix and stay aligned with `target_names`, even when
    # a class is absent from this split or `labels` is not in sorted order.
    print(classification_report(y_test, y_pred, labels=labels,
                                target_names=target_names))
    print('*****SCORES*****')
    print(mnb_clf.score(X_test_vec, y_test))
    
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Fit a LinearSVC (C=1) and print its evaluation on the test set.

    Prints, in order: a header, the confusion matrix, the classification
    report, the decision-function values for the first test sample and the
    value of ``score`` on the test data. Returns nothing.

    Args:
        X_train_vec: Vectorized training documents.
        X_test_vec: Vectorized test documents.
        y_train: Training labels.
        y_test: Test labels.
        labels: Class labels, in the order used for the confusion-matrix
            rows/columns and the report rows.
        target_names: Display names for the report, aligned with ``labels``.
    """
    # NOTE(review): recorded runs of this notebook show liblinear emitting a
    # ConvergenceWarning for some vectorizers; consider raising `max_iter`
    # if that matters. Hyperparameters are deliberately left unchanged here.
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)
    print('=====SVM=====')
    y_pred = svm_clf.predict(X_test_vec)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    print('=====CLASSIFICATION REPORT=====')
    # Pass `labels` explicitly so the report rows follow the same class order
    # as the confusion matrix and stay aligned with `target_names`, even when
    # a class is absent from this split or `labels` is not in sorted order.
    print(classification_report(y_test, y_pred, labels=labels,
                                target_names=target_names))

    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    # Only the first test sample's scores are shown, as a spot check.
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec, y_test))
    
def do_the_thing(X,y,labels, target_names):
    """Evaluate MNB and linear SVM once per entry of ``vectorizers``.

    For each vectorizer a banner naming its list index and class is printed,
    the data is split and vectorized, and both classifiers print their reports.

    Args:
        X: Sequence of raw text documents.
        y: Sequence of labels aligned with ``X``.
        labels: Class labels forwarded to the evaluation helpers.
        target_names: Display names forwarded to the evaluation helpers.
    """
    banner = '+' * 40
    for idx, vectorizer in enumerate(vectorizers):
        # Everything before the first "(" of the string form is used as the name.
        class_name = str(vectorizer).partition('(')[0]
        print(banner)
        print(f'Vectorizer Scores for {idx}_{class_name}')
        print(banner)
        split = get_test_train_vec(X, y, vectorizer)
        run_mnb(*split, labels, target_names)
        run_svm(*split, labels, target_names)
    
In [4]:
import pandas as pd

# Load the tab-separated training file and pull out the text and label columns.
train = pd.read_csv("kaggle-sentiment/train.tsv", sep='\t')
X = train['Phrase'].values
y = train['Sentiment'].values

# Five sentiment classes; the report names are the labels rendered as strings.
sentiment_labels = [0, 1, 2, 3, 4]
do_the_thing(X, y, sentiment_labels, [str(label) for label in sentiment_labels])
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 0_CountVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  733  1264   817   106    11]
 [  602  4132  5411   649    30]
 [  246  2397 25756  3226   239]
 [   19   454  5580  6248   767]
 [    1    54   725  1972   985]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.46      0.25      0.32      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

    accuracy                           0.61     62424
   macro avg       0.53      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424

*****SCORES*****
0.606401384083045
=====SVM=====
=====CONFUSION MATRIX=====
[[  913  1229   696    79    14]
 [  705  4094  5472   527    26]
 [  190  2111 27063  2324   176]
 [   33   394  6011  5568  1062]
 [    3    51   582  1775  1326]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.76     31864
           3       0.54      0.43      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

=====CONFIDENCE SCORES=====
[-1.0482547  -0.50286654  0.2091063  -0.97398092 -1.15145378]
=====SCORES=====
0.6241830065359477
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 1_CountVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  742  1276   797   105    11]
 [  614  4126  5397   655    32]
 [  248  2385 25756  3239   236]
 [   19   456  5570  6253   770]
 [    1    53   729  1977   977]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.46      0.25      0.33      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

    accuracy                           0.61     62424
   macro avg       0.52      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424

*****SCORES*****
0.606401384083045
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
=====SVM=====
=====CONFUSION MATRIX=====
[[  918  1221   697    82    13]
 [  701  4080  5504   514    25]
 [  195  2106 27081  2310   172]
 [   34   396  6048  5533  1057]
 [    3    51   590  1772  1321]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.75     31864
           3       0.54      0.42      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

=====CONFIDENCE SCORES=====
[-1.01718415 -0.5076005   0.22331207 -0.97514731 -1.24718848]
=====SCORES=====
0.6236864026656415
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 2_CountVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  867  1253   725    69    17]
 [  786  4440  4943   609    46]
 [  459  2961 24437  3600   407]
 [   41   513  5082  6375  1057]
 [    6    46   602  1911  1172]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.40      0.30      0.34      2931
           1       0.48      0.41      0.44     10824
           2       0.68      0.77      0.72     31864
           3       0.51      0.49      0.50     13068
           4       0.43      0.31      0.36      3737

    accuracy                           0.60     62424
   macro avg       0.50      0.45      0.47     62424
weighted avg       0.58      0.60      0.59     62424

*****SCORES*****
0.5973824170190952
/usr/local/lib/python3.7/site-packages/sklearn/svm/base.py:929: ConvergenceWarning: Liblinear failed to converge, increase the number of iterations.
  "the number of iterations.", ConvergenceWarning)
=====SVM=====
=====CONFUSION MATRIX=====
[[ 1039  1276   542    63    11]
 [  864  4555  4911   457    37]
 [  252  2470 26246  2700   196]
 [   28   358  5383  6034  1265]
 [    5    27   452  1794  1459]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.47      0.35      0.41      2931
           1       0.52      0.42      0.47     10824
           2       0.70      0.82      0.76     31864
           3       0.55      0.46      0.50     13068
           4       0.49      0.39      0.44      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.49      0.51     62424
weighted avg       0.61      0.63      0.62     62424

=====CONFIDENCE SCORES=====
[-1.35329509 -0.56433734  0.50417972 -0.98434221 -1.14487822]
=====SCORES=====
0.6300941945405614
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 3_TfidfVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  107  1144  1613    67     0]
 [   61  2580  7821   361     1]
 [   19  1168 28673  1987    17]
 [    0   147  7942  4883    96]
 [    0    11  1374  2164   188]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.57      0.04      0.07      2931
           1       0.51      0.24      0.33     10824
           2       0.60      0.90      0.72     31864
           3       0.52      0.37      0.43     13068
           4       0.62      0.05      0.09      3737

    accuracy                           0.58     62424
   macro avg       0.57      0.32      0.33     62424
weighted avg       0.57      0.58      0.53     62424

*****SCORES*****
0.5836056644880174
=====SVM=====
=====CONFUSION MATRIX=====
[[  795  1387   624   117     8]
 [  589  4336  5245   629    25]
 [  163  2299 26557  2684   161]
 [   24   408  5604  6220   812]
 [    2    40   551  2010  1134]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.51      0.27      0.35      2931
           1       0.51      0.40      0.45     10824
           2       0.69      0.83      0.75     31864
           3       0.53      0.48      0.50     13068
           4       0.53      0.30      0.39      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.61      0.63      0.61     62424

=====CONFIDENCE SCORES=====
[-1.01488208 -0.38030889  0.16542161 -0.97048325 -1.23292618]
=====SCORES=====
0.6254325259515571
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 4_TfidfVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  107  1144  1613    67     0]
 [   61  2580  7821   361     1]
 [   19  1168 28673  1987    17]
 [    0   147  7942  4883    96]
 [    0    11  1374  2164   188]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.57      0.04      0.07      2931
           1       0.51      0.24      0.33     10824
           2       0.60      0.90      0.72     31864
           3       0.52      0.37      0.43     13068
           4       0.62      0.05      0.09      3737

    accuracy                           0.58     62424
   macro avg       0.57      0.32      0.33     62424
weighted avg       0.57      0.58      0.53     62424

*****SCORES*****
0.5836056644880174
=====SVM=====
=====CONFUSION MATRIX=====
[[  795  1387   624   117     8]
 [  589  4336  5245   629    25]
 [  163  2299 26557  2684   161]
 [   24   408  5604  6220   812]
 [    2    40   551  2010  1134]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.51      0.27      0.35      2931
           1       0.51      0.40      0.45     10824
           2       0.69      0.83      0.75     31864
           3       0.53      0.48      0.50     13068
           4       0.53      0.30      0.39      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.61      0.63      0.61     62424

=====CONFIDENCE SCORES=====
[-1.01488249 -0.38032514  0.16541625 -0.97048002 -1.23292607]
=====SCORES=====
0.6254325259515571
++++++++++++++++++++++++++++++++++++++++
Vectorizer Scores for 5_TfidfVectorizer
++++++++++++++++++++++++++++++++++++++++
*****MNB*****
*****CONFUSION MATRIX*****
[[  179  1186  1513    52     1]
 [   77  2868  7598   279     2]
 [   18  1242 28695  1897    12]
 [    1   140  7680  5128   119]
 [    0    18  1326  2127   266]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.65      0.06      0.11      2931
           1       0.53      0.26      0.35     10824
           2       0.61      0.90      0.73     31864
           3       0.54      0.39      0.45     13068
           4       0.67      0.07      0.13      3737

    accuracy                           0.59     62424
   macro avg       0.60      0.34      0.36     62424
weighted avg       0.59      0.59      0.54     62424

*****SCORES*****
0.5948993976675637
=====SVM=====
=====CONFUSION MATRIX=====
[[  916  1373   565    69     8]
 [  696  4666  4947   493    22]
 [  217  2507 26156  2827   157]
 [   25   364  5343  6334  1002]
 [    5    32   475  1962  1263]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.49      0.31      0.38      2931
           1       0.52      0.43      0.47     10824
           2       0.70      0.82      0.75     31864
           3       0.54      0.48      0.51     13068
           4       0.52      0.34      0.41      3737

    accuracy                           0.63     62424
   macro avg       0.55      0.48      0.51     62424
weighted avg       0.61      0.63      0.62     62424

=====CONFIDENCE SCORES=====
[-1.17972911 -0.41383963  0.29126027 -0.87403664 -1.04112626]
=====SCORES=====
0.6301262334999359
In [1]:
import matplotlib.pyplot as plt
import numpy as np; np.random.seed(1)
import pandas as pd
import seaborn as sns

# Toy frame: one x column and three random y series (seeded above so the
# figure is reproducible).
data=pd.DataFrame({"VarX" : np.arange(10), 
                   'VarY1': np.random.rand(10),
                   'VarY2': np.random.rand(10),
                   'VarY3': np.random.rand(10)})

fig, ax = plt.subplots(figsize=(10,6))
# Give each regplot its own label and let the legend collect the labelled
# artists. Passing a bare `labels=[...]` list to the legend pairs the labels
# with artists in creation order, and every regplot adds several artists
# (points, fit line, confidence band), so the names would not line up with
# the three series.
for column, label in [('VarY1', 'First'), ('VarY2', 'Second'), ('VarY3', 'Third')]:
    sns.regplot(x='VarX', y=column, data=data, label=label, ax=ax)
# Each regplot call overwrites the y label with its own column name; use a
# neutral label since the axes hold all three series.
ax.set_ylabel('Value')
ax.legend()
plt.show()
<Figure size 1000x600 with 1 Axes>
In [ ]: