## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
unigram_bool_cv = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english')
bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
mnb = MultinomialNB()
svm = LinearSVC(C=1)
def get_test_train_vec(X,y,vectorizer):
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
return X_train_vec, X_test_vec, y_train, y_test
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
clf = classifier
clf.fit(X_train_vec,y_train)
print(clf.score(X_test_vec,y_test))
def get_model(X, y, labels, target_names, classifier, vec):
X_train_vec, X_test_vec, y_train, y_test = get_test_train_vec(X,y,vec)
model = run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier)
return model