In [ ]:
 
In [194]:
## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.corpus import stopwords

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string

from tabulate import tabulate
import numpy as np

# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')

vectorizers = [
    CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
    CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
#     CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
#     CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
#     TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]

def get_test_train_vec(X,y,vectorizer):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test

def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    print('*****MNB*****')
    y_pred = mnb_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('*****CONFUSION MATRIX*****')
    print(cm)
    
    target_names = target_names
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.figure(figsize=(16,10))  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show
    
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    index = target_names
    columns = target_names 
    cm_df = pd.DataFrame(cm,columns,index)                      
    plt.figure(figsize=(16,10))  
    sns.heatmap(cm_df, annot=True, cmap="Blues")
    plt.show
    
    cm2 = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
#     print('*****CLASSIFICATION REPORT*****')
#     print(classification_report(y_test, y_pred, target_names=target_names))
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
#     print('TYPE', type(report))
#     print(report['0'])
    df = pd.DataFrame(report)
#     print(pd.DataFrame(report).T['recall'])
    print('BY SENTIMENT')
    print(tabulate(df[df.columns[0:4]], tablefmt="simple", headers=df.columns, floatfmt=".2f"))
    print('BY PERFORMANCE')
    print(tabulate(df[df.columns[5:8]], tablefmt="simple", headers=df.columns[5:8], floatfmt=".2f"))
    return(pd.DataFrame(report).T)
    
#     print('*****SCORES*****')
#     print(mnb_clf.score(X_test_vec, y_test))
    
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec,y_train)
    print('=====SVM=====')
    y_pred = svm_clf.predict(X_test_vec)
    cm=confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    target_names = target_names
    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))
    report = classification_report(y_test, y_pred, target_names=target_names,output_dict=True)
    return(pd.DataFrame(report).T)
    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec,y_test))
    
def do_the_thing(X,y,labels, target_names):
    for vec in vectorizers:
#         print(vec)
        X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
        run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
#         run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
In [195]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
*****MNB*****
*****CONFUSION MATRIX*****
[[  733  1264   817   106    11]
 [  602  4132  5411   649    30]
 [  246  2397 25756  3226   239]
 [   19   454  5580  6248   767]
 [    1    54   725  1972   985]]
BY SENTIMENT
0                1         2         3         4
---------  -------  --------  --------  --------
precision     0.46      0.50      0.67      0.51
recall        0.25      0.38      0.81      0.48
f1-score      0.32      0.43      0.73      0.49
support    2931.00  10824.00  31864.00  13068.00
BY PERFORMANCE
             accuracy    macro avg    weighted avg
---------  ----------  -----------  --------------
precision        0.61         0.53            0.59
recall           0.61         0.44            0.61
f1-score         0.61         0.47            0.59
support          0.61     62424.00        62424.00
*****MNB*****
*****CONFUSION MATRIX*****
[[  742  1276   797   105    11]
 [  614  4126  5397   655    32]
 [  248  2385 25756  3239   236]
 [   19   456  5570  6253   770]
 [    1    53   729  1977   977]]
BY SENTIMENT
0                1         2         3         4
---------  -------  --------  --------  --------
precision     0.46      0.50      0.67      0.51
recall        0.25      0.38      0.81      0.48
f1-score      0.33      0.43      0.73      0.49
support    2931.00  10824.00  31864.00  13068.00
BY PERFORMANCE
             accuracy    macro avg    weighted avg
---------  ----------  -----------  --------------
precision        0.61         0.52            0.59
recall           0.61         0.44            0.61
f1-score         0.61         0.47            0.59
support          0.61     62424.00        62424.00
In [159]:
import seaborn as sns
import matplotlib.pyplot as plt
m_df = mnb_report.copy()
s_df = svm_report.copy()

def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))
    plt.show()

# barplot_single_report(m_df)

def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    plt.figure(figsize=(10, 6))
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
    plt.title('Comparing MNB & SVM')
    plt.show()
# barplot_both_report(m_df, s_df)

def get_report(X,y,labels, target_names):
    vec = vectorizers[0]
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
    mnb_report = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    svm_report = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    barplot_single_report(mnb_report)
    barplot_single_report(svm_report)
    barplot_both_report(mnb_report, svm_report)
#     return mnb_report, svm_report

    
In [160]:
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
# do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
# mnb_report, svm_report = get_report(X,y,[0,1,2,3,4],['0','1','2','3','4'])
get_report(X,y,[0,1,2,3,4],['0','1','2','3','4'])
*****MNB*****
*****CONFUSION MATRIX*****
[[  733  1264   817   106    11]
 [  602  4132  5411   649    30]
 [  246  2397 25756  3226   239]
 [   19   454  5580  6248   767]
 [    1    54   725  1972   985]]
*****CLASSIFICATION REPORT*****
              precision    recall  f1-score   support

           0       0.46      0.25      0.32      2931
           1       0.50      0.38      0.43     10824
           2       0.67      0.81      0.73     31864
           3       0.51      0.48      0.49     13068
           4       0.48      0.26      0.34      3737

    accuracy                           0.61     62424
   macro avg       0.53      0.44      0.47     62424
weighted avg       0.59      0.61      0.59     62424

TYPE <class 'dict'>
{'precision': 0.45783885071830105, 'recall': 0.25008529512111904, 'f1-score': 0.323477493380406, 'support': 2931}
0               0.250085
1               0.381744
2               0.808310
3               0.478114
4               0.263580
accuracy        0.606401
macro avg       0.436367
weighted avg    0.606401
Name: recall, dtype: float64
=====SVM=====
=====CONFUSION MATRIX=====
[[  913  1229   696    79    14]
 [  705  4094  5472   527    26]
 [  190  2111 27063  2324   176]
 [   33   394  6011  5568  1062]
 [    3    51   582  1775  1326]]
=====CLASSIFICATION REPORT=====
              precision    recall  f1-score   support

           0       0.50      0.31      0.38      2931
           1       0.52      0.38      0.44     10824
           2       0.68      0.85      0.76     31864
           3       0.54      0.43      0.48     13068
           4       0.51      0.35      0.42      3737

    accuracy                           0.62     62424
   macro avg       0.55      0.46      0.49     62424
weighted avg       0.60      0.62      0.60     62424

In [137]:
import seaborn as sns
import matplotlib.pyplot as plt
m_df = mnb_report.copy()
s_df = svm_report.copy()

def barplot_single_report(report):
    accuracy = report[5:6]['support'].values
    just_sent = report[:5]
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent.reset_index(inplace=True)
    df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="score_value",
               hue="sentiment",
               data=df);
    plt.title('ACCURACY: ' + str(accuracy))

barplot_single_report(m_df)
    
def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
barplot_both_report(m_df, s_df)
    
In [138]:
barplot_single_report(s_df)
In [132]:
m_df[5:6]['support']
Out[132]:
accuracy    0.606401
Name: support, dtype: float64
In [ ]:
 
In [151]:
m_df2 = m_df
s_df2 = s_df
m_df2['model'] = 'mnb'
s_df2['model'] = 'svm'
df = m_df2[:5].append(s_df2[:5])
df.reset_index(inplace=True)
df2 = df[['index','f1-score','model']]
df2
# df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
sns.barplot(x="index",
           y="f1-score",
           hue="model",
           data=df2);
In [152]:
def barplot_both_report(m_df, s_df):
    m_df2 = m_df
    s_df2 = s_df
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    df = m_df2[:5].append(s_df2[:5])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
    # df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
    sns.barplot(x="index",
               y="f1-score",
               hue="model",
               data=df2);
barplot_both_report(m_df, s_df)
In [109]:
m_df = mnb_report.copy()
s_df = svm_report.copy()
# mnb_report.T
# df = mnb_report.copy()
import seaborn as sns
import matplotlib.pyplot as plt


# df = df.drop('support', axis=1)

just_sent = s_df[:5]

just_sent = just_sent[just_sent.columns[0:3]]
just_sent.reset_index(inplace=True)
just_sent

df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
df
# sns.barplot(data = df)
sns.barplot(x="index",
           y="score_value",
           hue="sentiment",
           data=df);
In [74]:
mnb_report
df = mnb_report.copy().drop('support', axis=1)
df
Out[74]:
precision recall f1-score model
0 0.457839 0.250085 0.323477 mnb
1 0.497771 0.381744 0.432105 mnb
2 0.672674 0.808310 0.734281 mnb
3 0.512089 0.478114 0.494519 mnb
4 0.484744 0.263580 0.341480 mnb
accuracy 0.606401 0.606401 0.606401 mnb
macro avg 0.525023 0.436367 0.465172 mnb
weighted avg 0.587392 0.606401 0.588889 mnb
In [ ]:
# df2 = agree_df.groupby(['agree_factor', 'PoN']).count()
# df2.reset_index(inplace=True)
In [63]:
mnb_report['model'] = 'mnb'
svm_report['model'] = 'svm'
all_df = mnb_report.append(svm_report)
df2 = all_df.groupby(['model'])
df2 = pd.DataFrame(df2)
# sns.barplot(data = df2)
df2
Out[63]:
0 1
0 mnb precision recall f1-score ...
1 svm precision recall f1-score ...
In [69]:
df3 = mnb_report.T.reset_index()
In [98]:
 
In [100]:
# sns.barplot(data = df3)
df3 = mnb_report.T.reset_index()[:3]
df3
df4 = pd.melt(df3, id_vars="index", var_name="sentiment", value_name="score_value")
# sns.barplot(data = df4

sns.barplot(x="index",
           y="score_value",
           hue="sentiment",
           data=df4);
# df4 = pd.melt(df3, id_vars="", var_names="", value_name=" ")
In [93]:
sns.barplot(x="sentiment",
           y="score_value",
           hue="index",
           data=df4);
In [94]:
df5 = df4[df4.columns[0:3]]
In [95]:
df5
Out[95]:
index sentiment score_value
0 precision 0 0.457839
1 recall 0 0.250085
2 f1-score 0 0.323477
3 precision 1 0.497771
4 recall 1 0.381744
5 f1-score 1 0.432105
6 precision 2 0.672674
7 recall 2 0.80831
8 f1-score 2 0.734281
9 precision 3 0.512089
10 recall 3 0.478114
11 f1-score 3 0.494519
12 precision 4 0.484744
13 recall 4 0.26358
14 f1-score 4 0.34148
15 precision accuracy 0.606401
16 recall accuracy 0.606401
17 f1-score accuracy 0.606401
18 precision macro avg 0.525023
19 recall macro avg 0.436367
20 f1-score macro avg 0.465172
21 precision weighted avg 0.587392
22 recall weighted avg 0.606401
23 f1-score weighted avg 0.588889
In [ ]: