## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english', token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )
unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)
def get_test_train_vec(X,y,vectorizer):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
return X_train_vec, X_test_vec, y_train, y_test
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
clf = classifier
clf.fit(X_train_vec,y_train)
print(clf.score(X_test_vec,y_test))
return clf
def get_model(X, y, labels, target_names, classifier, vec):
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
model = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
return model
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd
def return_features(vec, model):
for i,feature_probability in enumerate(model.coef_):
print('============ Sentiment Score: ', i)
df1 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[:10])
df2 = pd.DataFrame(sorted(zip(feature_probability, vec.get_feature_names()))[-10:])
df3 = pd.concat([df1, df2], axis=1)
print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))