## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every file in `path` and return their contents as a list of strings."""
    results = []
    for file_name in os.listdir(path):
        # os.path.join handles paths with or without a trailing separator
        # (plain `path + file` breaks when the separator is missing)
        with open(os.path.join(path, file_name)) as f:
            results.append(f.read())
    return results
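# Example (hypothetical path -- each file becomes one document string):
#   docs = get_data_from_files('data/reviews/')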
## =======================================================
## MACHINE LEARNING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
vectorizers = [
    # Binary (presence/absence) unigrams
    CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english'),
    # Raw unigram counts
    CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english'),
    # Unigram + bigram counts
    CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english'),
    # Unigram tf-idf
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english'),
    # Unigram tf-idf, dropping terms that appear in more than half the documents
    TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, max_df=0.50, stop_words='english'),
    # Unigram + bigram tf-idf
    TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
]
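# What these produce, sketched on a toy corpus (illustrative only):
#   toy = ['the cat sat on the mat', 'the dog sat on the log']
#   cv = CountVectorizer()
#   X_toy = cv.fit_transform(toy)        # sparse document-term count matrix
#   cv.get_feature_names_out()           # -> array(['cat', 'dog', 'log', ...])
# TfidfVectorizer additionally downweights terms that occur in many documents.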
def get_test_train_vec(X, y, vectorizer):
    """Split the corpus 60/40, then vectorize: fit on train only, transform test."""
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
    # Fit the vocabulary on the training split only, so no test-set
    # information leaks into the features
    X_train_vec = vectorizer.fit_transform(X_train)
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test
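# Example (continuing the hypothetical two-class setup above):
#   Xtr, Xte, ytr, yte = get_test_train_vec(X, y, vectorizers[0])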
def run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Train and evaluate a Multinomial Naive Bayes classifier."""
    mnb_clf = MultinomialNB()
    mnb_clf.fit(X_train_vec, y_train)
    print('*****MNB*****')
    y_pred = mnb_clf.predict(X_test_vec)
    # Rows and columns of the confusion matrix follow the order of `labels`
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('*****CONFUSION MATRIX*****')
    print(cm)
    print('*****CLASSIFICATION REPORT*****')
    print(classification_report(y_test, y_pred, target_names=target_names))
    print('*****SCORES*****')
    print(mnb_clf.score(X_test_vec, y_test))
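# Note: MultinomialNB models term counts, so the CountVectorizer features fit
# its assumptions most directly; it still runs on tf-idf weights, which often
# works well in practice, but the probabilistic interpretation is looser.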
def run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names):
    """Train and evaluate a linear Support Vector Machine classifier."""
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)
    print('=====SVM=====')
    y_pred = svm_clf.predict(X_test_vec)
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print('=====CONFUSION MATRIX=====')
    print(cm)
    print('=====CLASSIFICATION REPORT=====')
    print(classification_report(y_test, y_pred, target_names=target_names))
    # decision_function returns each sample's signed distance to the separating
    # hyperplane; a larger magnitude means a more confident prediction
    svm_confidence_scores = svm_clf.decision_function(X_test_vec)
    print('=====CONFIDENCE SCORES=====')
    print(svm_confidence_scores[0])
    print('=====SCORES=====')
    print(svm_clf.score(X_test_vec, y_test))
def do_the_thing(X, y, labels, target_names):
    """Run both classifiers against every vectorizer and print the results."""
    for i, vec in enumerate(vectorizers):
        vec_type = type(vec).__name__
        print('++' * 20)
        print('Vectorizer Scores for ' + str(i) + '_' + vec_type)
        print('++' * 20)
        X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X, y, vec)
        run_mnb(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
        run_svm(X_train_vec, X_test_vec, y_train, y_test, labels, target_names)
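# End-to-end usage sketch. The folder names below are hypothetical -- point
# them at directories of positive and negative documents for a two-class task.
if __name__ == '__main__':
    pos = get_data_from_files('data/pos/')   # hypothetical path
    neg = get_data_from_files('data/neg/')   # hypothetical path
    X = pos + neg
    y = [1] * len(pos) + [0] * len(neg)      # 1 = positive, 0 = negative
    do_the_thing(X, y, labels=[0, 1], target_names=['neg', 'pos'])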