## =======================================================
## MACHINE LEARNING
## =======================================================
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from sklearn.svm import LinearSVC
from tabulate import tabulate
# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
# Feature extractors that do_the_thing() iterates over: a binary
# (presence/absence) count vectorizer first, then a raw-count one. Both share
# the same encoding, minimum document frequency and English stop-word list.
vectorizers = [
    CountVectorizer(encoding='latin-1', binary=is_binary, min_df=5, stop_words='english')
    for is_binary in (True, False)
]
# Other configurations that are currently disabled:
# CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
# CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
# CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
# TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
# TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
# TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
def get_test_train_vec(X, y, vectorizer):
    """Split the data 60/40 and vectorize both parts.

    The vectorizer is fitted on the training portion only and then applied,
    unchanged, to the test portion. The split uses a fixed seed
    (``random_state=0``).

    Args:
        X: Raw documents to split and vectorize.
        y: Labels aligned with ``X``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``.

    Returns:
        Tuple ``(train_matrix, test_matrix, train_labels, test_labels)``.
    """
    split = train_test_split(X, y, test_size=0.4, random_state=0)
    train_docs, test_docs, train_labels, test_labels = split
    train_matrix = vectorizer.fit_transform(train_docs)
    test_matrix = vectorizer.transform(test_docs)
    return train_matrix, test_matrix, train_labels, test_labels
def _show_confusion_heatmap(matrix, target_names):
    """Draw *matrix* as an annotated heatmap labelled with *target_names*."""
    frame = pd.DataFrame(matrix, index=target_names, columns=target_names)
    plt.figure(figsize=(16, 10))
    sns.heatmap(frame, annot=True, cmap="Blues")
    # The call parentheses matter: a bare ``plt.show`` is a no-op expression.
    plt.show()


def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Train a multinomial Naive Bayes model and report how it performs.

    Prints the confusion matrix, shows it as a heatmap (raw counts, then
    normalized per true-label row), and prints the classification report as
    two tables: one with the per-class columns and one with the remaining
    summary columns.

    Args:
        X_train_vec: Vectorized training documents.
        X_test_vec: Vectorized test documents.
        y_train: Training labels.
        y_test: Test labels.
        labels: Label values, in the order used for the confusion matrix.
        target_names: Display names for the labels; also used to decide how
            many report columns are per-class columns.

    Returns:
        The classification report as a DataFrame, transposed so that each
        class / summary entry is a row.
    """
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    print('*****MNB*****')
    y_pred = mnb_clf.predict(X_test_vec)

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('*****CONFUSION MATRIX*****')
    print(cm)
    _show_confusion_heatmap(cm, target_names)

    # Divide each row by its total so cells become fractions of the true class.
    row_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    _show_confusion_heatmap(row_normalized, target_names)

    report = classification_report(
        y_test, y_pred, target_names=target_names, output_dict=True
    )
    df = pd.DataFrame(report)

    # The report lists one column per class first, then the summary entries.
    # Split on the number of classes so every class is printed (the previous
    # fixed 0:4 slice dropped the last of five classes).
    n_classes = len(target_names)
    per_class = df[df.columns[:n_classes]]
    summary = df[df.columns[n_classes:]]

    print('BY SENTIMENT')
    print(tabulate(per_class, tablefmt="simple", headers="keys", floatfmt=".2f"))
    print('BY PERFORMANCE')
    print(tabulate(summary, tablefmt="simple", headers="keys", floatfmt=".2f"))
    return df.T
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Train a linear SVM (C=1) and report how it performs.

    Prints the confusion matrix, the text classification report, the decision
    function values for the first test sample, and the score on the test set.

    Args:
        X_train_vec: Vectorized training documents.
        X_test_vec: Vectorized test documents.
        y_train: Training labels.
        y_test: Test labels.
        labels: Label values, in the order used for the confusion matrix.
        target_names: Display names for the labels in the report.

    Returns:
        The classification report as a DataFrame, transposed so that each
        class / summary entry is a row.
    """
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)
    print('=====SVM=====')
    y_pred = svm_clf.predict(X_test_vec)

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)

    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))
    report = classification_report(
        y_test, y_pred, target_names=target_names, output_dict=True
    )

    # These diagnostics used to sit after the return statement and never ran;
    # the return now comes last so they are actually printed.
    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec, y_test))

    return pd.DataFrame(report).T
def do_the_thing(X, y, labels, target_names):
    """Evaluate multinomial Naive Bayes once per entry in ``vectorizers``.

    Each vectorizer gets its own train/test split and vectorization, and the
    resulting matrices are handed to ``run_mnb``. The reports that ``run_mnb``
    returns are not kept.

    Args:
        X: Raw documents.
        y: Labels aligned with ``X``.
        labels: Label values passed through to ``run_mnb``.
        target_names: Display names passed through to ``run_mnb``.
    """
    for vectorizer in vectorizers:
        split_vectors = get_test_train_vec(X, y, vectorizer)
        run_mnb(*split_vectors, labels, target_names)
        # The SVM run is currently disabled:
        # run_svm(*split_vectors, labels, target_names)
# Load the tab-separated training file and pull out the label column
# ('Sentiment') and the text column ('Phrase') as arrays.
import pandas as pd
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
# Evaluate every configured vectorizer on the five labels 0-4.
do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
import seaborn as sns
import matplotlib.pyplot as plt
# NOTE(review): no module-level assignment to mnb_report / svm_report is
# visible above this point in the file, so these two lines presumably rely on
# state left over from an earlier interactive run -- TODO confirm.
m_df = mnb_report.copy()
s_df = svm_report.copy()
def barplot_single_report(report):
    """Show a grouped bar chart of the per-class scores in one report.

    The first five rows of ``report`` are treated as the per-class rows and
    the first three columns as the scores to plot. The 'support' value of the
    row at position 5 is shown in the title as the accuracy (presumably the
    'accuracy' row of a transposed classification report -- verify against
    the caller).

    Args:
        report: DataFrame with one row per class / summary entry.
    """
    accuracy = report[5:6]['support'].values

    score_columns = report.columns[0:3]
    # Move the row labels into an 'index' column so melt can key on them.
    per_class = report[:5][score_columns].reset_index()
    long_scores = per_class.melt(
        id_vars="index", var_name="sentiment", value_name="score_value"
    )

    plt.figure(figsize=(10, 6))
    sns.barplot(data=long_scores, x="index", y="score_value", hue="sentiment")
    plt.title('ACCURACY: ' + str(accuracy))
    plt.show()
def barplot_both_report(m_df, s_df):
    """Show the per-class f1-scores of the MNB and SVM reports side by side.

    Only the first five rows of each report (the per-class rows) are plotted.
    The inputs are left untouched: the 'model' tag is added to copies, not to
    the caller's DataFrames.

    Args:
        m_df: Report DataFrame for the multinomial Naive Bayes model.
        s_df: Report DataFrame for the SVM model.
    """
    mnb_rows = m_df[:5].copy()
    svm_rows = s_df[:5].copy()
    mnb_rows['model'] = 'mnb'
    svm_rows['model'] = 'svm'

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([mnb_rows, svm_rows]).reset_index()
    f1_by_model = combined[['index', 'f1-score', 'model']]

    plt.figure(figsize=(10, 6))
    sns.barplot(x="index", y="f1-score", hue="model", data=f1_by_model)
    plt.title('Comparing MNB & SVM')
    plt.show()
def get_report(X, y, labels, target_names):
    """Run MNB and SVM on the first configured vectorizer and plot the results.

    Draws one bar chart per model plus a chart comparing the two, then hands
    both reports back so callers can keep working with them.

    Args:
        X: Raw documents.
        y: Labels aligned with ``X``.
        labels: Label values passed through to the model runners.
        target_names: Display names passed through to the model runners.

    Returns:
        Tuple ``(mnb_report, svm_report)`` of report DataFrames.
    """
    vec = vectorizers[0]
    X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X, y, vec)
    mnb_report = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    svm_report = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
    barplot_single_report(mnb_report)
    barplot_single_report(svm_report)
    # Pass copies so the returned reports stay exactly as the models produced
    # them, whatever the comparison plot does to its arguments.
    barplot_both_report(mnb_report.copy(), svm_report.copy())
    return mnb_report, svm_report


import pandas as pd
train = pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y = train['Sentiment'].values
X = train['Phrase'].values
# do_the_thing(X,y,[0,1,2,3,4],['0','1','2','3','4'])
# Keep both reports at module level: the lines below (and later cells) read
# mnb_report / svm_report directly.
mnb_report, svm_report = get_report(X, y, [0, 1, 2, 3, 4], ['0', '1', '2', '3', '4'])
import seaborn as sns
import matplotlib.pyplot as plt
m_df = mnb_report.copy()
s_df = svm_report.copy()
def barplot_single_report(report):
    """Draw a grouped bar chart of the per-class scores in one report.

    Uses the first five rows and first three columns of ``report``. The
    'support' value of the row at position 5 goes into the title as the
    accuracy. Unlike a standalone plot helper, this version neither opens a
    new figure nor calls ``plt.show()``.

    Args:
        report: DataFrame with one row per class / summary entry.
    """
    accuracy = report[5:6]['support'].values
    class_rows = report[:5]
    # Row labels become an 'index' column, which melt uses as the identifier.
    scores = class_rows[class_rows.columns[0:3]].reset_index()
    melted = pd.melt(
        scores,
        id_vars="index",
        var_name="sentiment",
        value_name="score_value",
    )
    sns.barplot(data=melted, x="index", y="score_value", hue="sentiment")
    plt.title('ACCURACY: ' + str(accuracy))
# Plot the per-class score bars for the MNB report copy.
barplot_single_report(m_df)
def barplot_both_report(m_df, s_df):
    """Draw the per-class f1-scores of the MNB and SVM reports side by side.

    Only the first five rows of each report (the per-class rows) are plotted.
    The inputs are left untouched: the 'model' tag is added to copies, not to
    the caller's DataFrames. No new figure is opened and ``plt.show()`` is
    not called.

    Args:
        m_df: Report DataFrame for the multinomial Naive Bayes model.
        s_df: Report DataFrame for the SVM model.
    """
    mnb_rows = m_df[:5].copy()
    svm_rows = s_df[:5].copy()
    mnb_rows['model'] = 'mnb'
    svm_rows['model'] = 'svm'

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat([mnb_rows, svm_rows]).reset_index()
    f1_by_model = combined[['index', 'f1-score', 'model']]

    sns.barplot(x="index", y="f1-score", hue="model", data=f1_by_model)
# Compare the two models, then plot the SVM report on its own.
barplot_both_report(m_df, s_df)
barplot_single_report(s_df)
# Notebook-style peek at the 'support' value of the row at position 5.
m_df[5:6]['support']
# Tag each report with its model name. m_df2 / s_df2 are aliases, not copies,
# so the 'model' column is added to m_df / s_df as well.
m_df2 = m_df
s_df2 = s_df
m_df2['model'] = 'mnb'
s_df2['model'] = 'svm'
# Stack the first five rows of each report. pd.concat replaces
# DataFrame.append, which was removed in pandas 2.0.
df = pd.concat([m_df2[:5], s_df2[:5]])
df.reset_index(inplace=True)
df2 = df[['index','f1-score','model']]
df2
# df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
sns.barplot(x="index",
            y="f1-score",
            hue="model",
            data=df2);
def barplot_both_report(m_df, s_df):
    """Draw the per-class f1-scores of the MNB and SVM reports side by side.

    Only the first five rows of each report (the per-class rows) are plotted.
    The 'model' tag is written to copies of those rows, so the DataFrames
    passed in are not modified. The chart is drawn without opening a new
    figure or calling ``plt.show()``.

    Args:
        m_df: Report DataFrame for the multinomial Naive Bayes model.
        s_df: Report DataFrame for the SVM model.
    """
    tagged = []
    for frame, model_name in ((m_df, 'mnb'), (s_df, 'svm')):
        rows = frame[:5].copy()
        rows['model'] = model_name
        tagged.append(rows)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    combined = pd.concat(tagged).reset_index()
    f1_by_model = combined[['index', 'f1-score', 'model']]

    sns.barplot(x="index", y="f1-score", hue="model", data=f1_by_model)
# Draw the MNB-vs-SVM f1-score comparison.
barplot_both_report(m_df, s_df)
# Take fresh copies of both reports.
m_df = mnb_report.copy()
s_df = svm_report.copy()
# mnb_report.T
# df = mnb_report.copy()
import seaborn as sns
import matplotlib.pyplot as plt
# df = df.drop('support', axis=1)
# Keep the first five rows and first three columns of the SVM report, then
# move the row labels into an 'index' column.
just_sent = s_df[:5]
just_sent = just_sent[just_sent.columns[0:3]]
just_sent.reset_index(inplace=True)
just_sent
# Reshape to long form: one row per (index, score column) pair.
df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
df
# sns.barplot(data = df)
# One group of bars per 'index' value, one bar per melted score column.
sns.barplot(x="index",
y="score_value",
hue="sentiment",
data=df);
mnb_report
# Copy of the MNB report without its 'support' column.
df = mnb_report.copy().drop('support', axis=1)
df
# df2 = agree_df.groupby(['agree_factor', 'PoN']).count()
# df2.reset_index(inplace=True)
# Tag each report with its model name (this modifies the reports in place).
mnb_report['model'] = 'mnb'
svm_report['model'] = 'svm'
# Stack both reports. pd.concat replaces DataFrame.append, which was removed
# in pandas 2.0.
all_df = pd.concat([mnb_report, svm_report])
# NOTE(review): wrapping a groupby object in a DataFrame looks exploratory;
# confirm whether an aggregation was intended here.
df2 = all_df.groupby(['model'])
df2 = pd.DataFrame(df2)
# sns.barplot(data = df2)
df2
# Transpose the MNB report and move its row labels into an 'index' column.
df3 = mnb_report.T.reset_index()
# sns.barplot(data = df3)
# Same again, keeping only the first three rows of the transposed report.
df3 = mnb_report.T.reset_index()[:3]
df3
# Reshape to long form: one row per ('index' value, remaining column) pair.
df4 = pd.melt(df3, id_vars="index", var_name="sentiment", value_name="score_value")
# sns.barplot(data = df4
# Bars grouped by 'index', coloured by the melted 'sentiment' column.
sns.barplot(x="index",
y="score_value",
hue="sentiment",
data=df4);
# df4 = pd.melt(df3, id_vars="", var_names="", value_name=" ")
# Same data with the roles swapped: grouped by 'sentiment', coloured by 'index'.
sns.barplot(x="sentiment",
y="score_value",
hue="index",
data=df4);
# First three columns of the long-form frame.
df5 = df4[df4.columns[0:3]]
df5