## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
import string
from tabulate import tabulate
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
# unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
# bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
# bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
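# Vectorizer variants to compare: binary and count-based bag-of-words, a (1,2)-gram count
# model, and three tf-idf settings (one also dropping terms in more than half the documents
# via max_df=0.50); all remove English stop words and terms seen in fewer than 5 documents.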
vectorizers = [
CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english'),
CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
# CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=LemmaTokenizer()),
# CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english', tokenizer=my_tokenizer ),
TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]
def get_test_train_vec(X,y,vectorizer):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
return X_train_vec, X_test_vec, y_train, y_test
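# Example call (hypothetical names -- assumes a DataFrame `df` with raw text in 'text'
# and labels in 'sentiment'):
# X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(df['text'], df['sentiment'], vectorizers[0])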
def make_pretty_cm(y_test, y_pred, labels, target_names):
cm=confusion_matrix(y_test, y_pred, labels=labels)
plt.figure(figsize=(18,9))
    cm_df = pd.DataFrame(cm, index=target_names, columns=target_names)
plt.subplot(1, 2, 1)
sns.heatmap(cm_df, annot=True, cmap="Blues")
cm_n = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    cm_df = pd.DataFrame(cm_n, index=target_names, columns=target_names)
plt.subplot(1, 2, 2)
sns.heatmap(cm_df, annot=True, cmap="Blues")
plt.show()
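# make_pretty_cm draws two heatmaps side by side: raw confusion counts on the left and
# row-normalized proportions (share of each true class) on the right.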
# m_df = mnb_report.copy()
# s_df = svm_report.copy()
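# Plot helpers below: barplot_all plots precision/recall/f1 for every row of the report,
# barplot_single_report plots only the per-class scores with accuracy in the title, and
# barplot_both_report compares MNB and SVM f1-scores side by side.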
def barplot_all(report):
    report = report.drop('support', axis=1)
report = report.T.reset_index()
# report = report.T
# report.reset_index(inplace=True)
# df5 = df4[df4.columns[0:3]]
df4 = pd.melt(report, id_vars="index", var_name="sentiment", value_name="score_value")
plt.figure(figsize=(10, 6))
sns.barplot(x="index",
y="score_value",
hue="sentiment",
data=df4);
plt.show()
def barplot_single_report(report):
    # classification_report adds 'accuracy', 'macro avg' and 'weighted avg' rows after the per-class rows
    accuracy = report.loc['accuracy', 'support']
    just_sent = report.drop(['accuracy', 'macro avg', 'weighted avg'])
    just_sent = just_sent[just_sent.columns[0:3]]
    just_sent = just_sent.reset_index()
df = pd.melt(just_sent, id_vars="index", var_name="sentiment", value_name="score_value")
plt.figure(figsize=(10, 6))
sns.barplot(x="index",
y="score_value",
hue="sentiment",
data=df);
    plt.title(f'ACCURACY: {accuracy:.3f}')
plt.show()
def barplot_both_report(m_df, s_df):
    m_df2 = m_df.copy()
    s_df2 = s_df.copy()
    m_df2['model'] = 'mnb'
    s_df2['model'] = 'svm'
    # keep only the per-class rows; DataFrame.append was removed in pandas 2.x, so use pd.concat
    skip = ['accuracy', 'macro avg', 'weighted avg']
    df = pd.concat([m_df2.drop(skip), s_df2.drop(skip)])
    df.reset_index(inplace=True)
    df2 = df[['index','f1-score','model']]
# df = pd.melt(df, id_vars="index", var_name="sentiment", value_name="score_value")
plt.figure(figsize=(10, 6))
sns.barplot(x="index",
y="f1-score",
hue="model",
data=df2);
plt.title('Comparing MNB & SVM')
plt.show()
def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
print('----'*10)
print('##MNB')
print('----'*10)
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_vec, y_train)
y_pred = mnb_clf.predict(X_test_vec)
make_pretty_cm(y_test, y_pred, labels, target_names)
    report = classification_report(y_test, y_pred, labels=labels, target_names=target_names, output_dict=True)
df = pd.DataFrame(report)
print('======'*10)
print('BY SENTIMENT')
print('======'*10)
    print(tabulate(df[target_names], tablefmt="fancy_grid", headers="keys", floatfmt=".2f"))
print('======'*10)
print('BY PERFORMANCE')
print('======'*10)
    print(tabulate(df[['accuracy', 'macro avg', 'weighted avg']], tablefmt="fancy_grid", headers="keys", floatfmt=".2f"))
    report_df = pd.DataFrame(report).T
    barplot_single_report(report_df)
    barplot_all(report_df)
    return report_df, mnb_clf
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
print('----'*10)
print('##SVM')
print('----'*10)
svm_clf = LinearSVC(C=1)
svm_clf.fit(X_train_vec,y_train)
y_pred = svm_clf.predict(X_test_vec)
make_pretty_cm(y_test, y_pred, labels, target_names)
    report = classification_report(y_test, y_pred, labels=labels, target_names=target_names, output_dict=True)
df = pd.DataFrame(report)
print('======'*10)
print('BY SENTIMENT')
print('======'*10)
    print(tabulate(df[target_names], tablefmt="fancy_grid", headers="keys", floatfmt=".2f"))
print('======'*10)
print('BY PERFORMANCE')
print('======'*10)
    print(tabulate(df[['accuracy', 'macro avg', 'weighted avg']], tablefmt="fancy_grid", headers="keys", floatfmt=".2f"))
    report_df = pd.DataFrame(report).T
    barplot_single_report(report_df)
    barplot_all(report_df)
    return report_df, svm_clf
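# Example comparison on one vectorized split (hypothetical variable names -- run_svm and
# barplot_both_report are left commented out in do_the_thing below):
# mnb_report, mnb_clf = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
# svm_report, svm_clf = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
# barplot_both_report(mnb_report, svm_report)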
def get_features(vec, clf):
    # LinearSVC exposes coef_; recent sklearn versions of MultinomialNB expose feature_log_prob_ instead
    weights = clf.coef_[0] if hasattr(clf, 'coef_') else clf.feature_log_prob_[0]
    # get_feature_names_out() replaces get_feature_names(), which was removed in sklearn 1.2
    feature_ranks = sorted(zip(weights, vec.get_feature_names_out()))
    ## the 10 features that are the strongest indicators of the first class (e.g. very negative)
    ## sit at the end of the ascending ranking
    very_negative_10 = feature_ranks[-10:]
    print("Very negative words")
    for weight, word in very_negative_10:
        print((weight, word))
    print()
    ## the 10 features least relevant to the first class sit at the start of the ranking
    not_very_negative_10 = feature_ranks[:10]
    print("not very negative words")
    for weight, word in not_very_negative_10:
        print((weight, word))
    print()
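# Example: inspect the most/least indicative terms for the first class of a fitted model
# (hypothetical variable names; the vectorizer must already be fitted):
# get_features(vectorizers[0], mnb_clf)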
def do_the_thing(X,y,labels, target_names):
all_reports = []
for i,vec in enumerate(vectorizers):
params = vec.get_params()
df = pd.DataFrame([params]).T
vec_type = str(vec).split('(')[0]
title = str(i)+ '_' + vec_type
print(title)
        print(tabulate(df, tablefmt="fancy_grid", headers=["parameter", "value"], floatfmt=".2f"))
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
mnb_report, mnb_clf = run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
# svm_report, svm_clf = run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
# barplot_both_report(mnb_report, svm_report)
print('MNB FEATURES******************************')
get_features(vec, mnb_clf)
# print('SVM FEATURES******************************')
# get_features(vec, svm_clf)
        # all_reports.append({ i : {'mnb': mnb_report, 'svm': svm_report} })
        all_reports.append({i: {'mnb': mnb_report}})
    return all_reports
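# Example driver (hypothetical file/column names -- assumes a Kaggle-style sentiment TSV
# with a 'Phrase' text column and a 0-4 'Sentiment' label):
# train = pd.read_csv('train.tsv', sep='\t')
# labels = [0, 1, 2, 3, 4]
# target_names = ['very negative', 'negative', 'neutral', 'positive', 'very positive']
# all_reports = do_the_thing(train['Phrase'], train['Sentiment'], labels, target_names)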