## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
# Keyword settings that every vectorizer below is constructed with.
_COMMON_KWARGS = {'encoding': 'latin-1', 'min_df': 5, 'stop_words': 'english'}
# Token pattern for the variants that only keep runs of two or more ASCII letters
# (so purely numeric tokens are not matched).
_ALPHA_TOKEN_PATTERN = r'(?u)\b[a-zA-Z]{2,}\b'

# --- Count-based vectorizers ---------------------------------------------
# Boolean (presence/absence) unigrams.
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_COMMON_KWARGS)
unigram_bool_cv_v2 = CountVectorizer(
    binary=True,
    token_pattern=_ALPHA_TOKEN_PATTERN,
    **_COMMON_KWARGS,
)
# Raw-count unigrams, letters-only tokens.
unigram_cv = CountVectorizer(
    binary=False,
    token_pattern=_ALPHA_TOKEN_PATTERN,
    **_COMMON_KWARGS,
)
# Unigrams plus bigrams (ngram_range=(1, 2)).
bigram_cv = CountVectorizer(ngram_range=(1, 2), **_COMMON_KWARGS)
bigram_cv_v2 = CountVectorizer(
    ngram_range=(1, 2),
    token_pattern=_ALPHA_TOKEN_PATTERN,
    **_COMMON_KWARGS,
)

# --- TF-IDF vectorizers ---------------------------------------------------
unigram_tv = TfidfVectorizer(use_idf=True, **_COMMON_KWARGS)
unigram_tv_v2 = TfidfVectorizer(
    use_idf=True,
    token_pattern=_ALPHA_TOKEN_PATTERN,
    **_COMMON_KWARGS,
)
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), **_COMMON_KWARGS)
bigram_tv_v2 = TfidfVectorizer(
    use_idf=True,
    ngram_range=(1, 2),
    token_pattern=_ALPHA_TOKEN_PATTERN,
    **_COMMON_KWARGS,
)
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
# Multinomial Naive Bayes, constructed with no arguments (library defaults).
# This single instance is the one handed to every "mnb" run further down.
mnb = MultinomialNB()
# Linear support-vector classifier constructed with C=1.
# This single instance is the one handed to every "svm" run further down.
svm = LinearSVC(C=1)
def get_test_train_vec(X, y, vectorizer, test_size=0.4, random_state=0):
    """Split the data into train/test parts and vectorize both.

    The vectorizer is fitted on the training documents only and then applied
    to the test documents, so the test split never influences the vocabulary.

    Args:
        X: Sequence of raw documents.
        y: Sequence of labels, parallel to ``X``.
        vectorizer: Object exposing ``fit_transform`` and ``transform``
            (e.g. one of the vectorizers defined in this file). It is fitted
            in place.
        test_size: Forwarded to ``train_test_split``. Defaults to ``0.4``,
            the value that used to be hard-coded here.
        random_state: Forwarded to ``train_test_split``. Defaults to ``0``,
            the value that used to be hard-coded here.

    Returns:
        Tuple ``(X_train_vec, X_test_vec, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    X_train_vec = vectorizer.fit_transform(X_train)
    # transform (not fit_transform): reuse the vocabulary learned on the training split.
    X_test_vec = vectorizer.transform(X_test)
    return X_train_vec, X_test_vec, y_train, y_test
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit ``classifier`` on the training split and evaluate it on the test split.

    Args:
        X_train_vec: Vectorized training documents.
        X_test_vec: Vectorized test documents.
        y_train: Training labels.
        y_test: Test labels.
        labels: Label values to include in the report, in the order that
            ``target_names`` describes them. ``None`` lets scikit-learn infer
            them from the data.
        target_names: Display names, positionally matched to ``labels``.
        classifier: Estimator exposing ``fit``, ``predict`` and ``score``.
            It is fitted in place and returned.

    Returns:
        Tuple ``(clf, score, report)``: the fitted classifier, the value of
        ``clf.score`` on the test split, and the ``classification_report``
        as a dict.
    """
    clf = classifier
    clf.fit(X_train_vec, y_train)
    y_pred = clf.predict(X_test_vec)
    # `labels` used to be accepted but ignored. target_names are positional, so
    # without an explicit label list scikit-learn pairs them with the sorted
    # labels it finds in y_test/y_pred, which need not match the caller's order
    # or count. Passing `labels` pins the pairing.
    report = classification_report(
        y_test,
        y_pred,
        labels=labels,
        target_names=target_names,
        output_dict=True,
    )
    score = clf.score(X_test_vec, y_test)
    return clf, score, report
def get_model(X, y, labels, target_names, classifier, vec):
    """Run the whole pipeline for one (vectorizer, classifier) pair.

    Splits and vectorizes ``X``/``y`` with ``get_test_train_vec`` and hands the
    four resulting pieces to ``run_classifier``.

    Returns:
        Tuple ``(model, score, report)`` exactly as produced by
        ``run_classifier``.
    """
    split_and_vectorized = get_test_train_vec(X, y, vec)
    fitted, accuracy, summary = run_classifier(
        *split_and_vectorized, labels, target_names, classifier
    )
    return fitted, accuracy, summary
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd
# import pandas as pd
# train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
# y=train['Sentiment'].values
# X=train['Phrase'].values
import pandas as pd
import numpy as np
# Load the dataset. The path is relative, so it is resolved against the
# current working directory (the CSV is expected one directory above it).
# NOTE(review): `df` is later rebound to the results table by the run section.
df = pd.read_csv('../death_row_discritized.csv')
def to_string(tokens):
    """Turn the text form of a token list into one space-separated string.

    ``tokens`` is expected to be the ``repr`` of a list of strings, e.g.
    ``"['thank', 'you']"``. It is parsed with ``ast.literal_eval`` rather than
    ``eval``: the value comes from a CSV cell, and ``eval`` would execute any
    expression found there. Only Python literals are accepted now.

    Args:
        tokens: String holding a list literal. Any other value (for example a
            float NaN from an empty cell) is treated as unparseable.

    Returns:
        The tokens joined by single spaces, or the sentinel string ``"error"``
        when the input cannot be parsed or is not an iterable of strings.
    """
    import ast  # local import keeps this function self-contained

    try:
        return " ".join(ast.literal_eval(tokens))
    # Only the failures parsing/joining can produce; a bare `except:` would
    # also have swallowed KeyboardInterrupt and SystemExit.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return "error"
# Column of `df` used as the classification target.
column_name = 'time_spent'
# Rebuild one text string per row from the 'last_statement' column via to_string.
# A plain column apply gives the same values as the row-wise df.apply(axis=1)
# without building a Series for every row.
df['statement_string'] = df['last_statement'].apply(to_string)
# y=df['vic_kid'].values
y = df[column_name].values
# Any non-string entry is replaced by the first row's value. isinstance (rather
# than an exact type comparison) also accepts str subclasses.
y = [value if isinstance(value, str) else y[0] for value in y]
# Sorted instead of raw set order: return_features indexes this list by coef_
# row, and scikit-learn keeps its classes in sorted order, whereas set iteration
# order for strings can differ from run to run. key=str keeps the sort from
# raising if a non-string fallback value slipped through above.
y_labels = sorted(set(y), key=str)
X = df['statement_string'].values
y_labels
def return_features(vec, model):
    """Print, per class, the ten lowest- and ten highest-weighted features.

    For every weight row of ``model`` a table is printed with the ten features
    carrying the smallest weights on the left and the ten carrying the largest
    weights on the right.

    Changes from the previous version:
      * Class names come from ``model.classes_`` (falling back to the global
        ``y_labels``), so each weight row is labelled with the class it
        belongs to. A two-class model with a single weight row is labelled
        with the second class, which is the one that row describes.
      * The column headers were inverted: the ascending sort puts the *lowest*
        weights first, yet that half was titled "Most Likely".
      * Works with both ``get_feature_names_out`` and the older
        ``get_feature_names``, and with models that expose
        ``feature_log_prob_`` instead of ``coef_``.
      * Each class's features are sorted once instead of twice.

    Args:
        vec: Fitted vectorizer providing the feature names.
        model: Fitted classifier exposing ``coef_`` or ``feature_log_prob_``.

    Returns:
        None. Output goes to stdout.
    """
    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); support whichever this vectorizer has.
    if hasattr(vec, 'get_feature_names_out'):
        feature_names = vec.get_feature_names_out()
    else:
        feature_names = vec.get_feature_names()

    weights = getattr(model, 'coef_', None)
    if weights is None:
        # Naive Bayes models in newer scikit-learn no longer expose coef_.
        # The legacy attribute was feature_log_prob_, reduced to the second
        # class's row for two-class problems.
        weights = model.feature_log_prob_
        if len(weights) == 2:
            weights = weights[1:]

    class_names = list(getattr(model, 'classes_', y_labels))
    if len(weights) == 1 and len(class_names) == 2:
        # A binary model has one weight row; it scores the second class.
        class_names = class_names[1:]

    for class_name, class_weights in zip(class_names, weights):
        print('============', column_name, ': ', class_name)
        ranked = sorted(zip(class_weights, feature_names))
        lowest = pd.DataFrame(ranked[:10])
        highest = pd.DataFrame(ranked[-10:])
        table = pd.concat([lowest, highest], axis=1)
        print(tabulate(table, tablefmt="fancy_grid", headers=["Least", "Likely", "Most", "Likely"], floatfmt=".2f"))
def update_big_df(big_df, new_row):
    """Record one result row and return the results so far as a DataFrame.

    Args:
        big_df: List of result dicts. It is appended to in place, so the
            caller's list keeps growing across calls.
        new_row: Dict describing one run.

    Returns:
        A DataFrame built from every row in ``big_df`` with exact duplicate
        rows removed (the list itself keeps its duplicates).
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()
# Accumulator for the comparison table: one dict per (classifier, vectorizer) run.
big_df = []

# V1 = boolean unigram counts with the default token pattern.
# Run Multinomial NB first, then the linear SVM, exactly as before.
vec = unigram_bool_cv_v1
for clf_name, classifier in (('mnb', mnb), ('svm', svm)):
    model, score, report = get_model(X, y, y_labels, y_labels, classifier, vec)
    return_features(vec, model)
    df = update_big_df(big_df, {'classifier': clf_name, 'vectorizer': 'V1', 'score': score})
df
## NOTES: Very interesting!! The MNB feature lists are very cluttered with numbers; the SVM lists are not.
# Remaining vectorizers, tagged with the label stored in the results table.
# Order matters only for the row order of the table; it matches the original
# sequence of runs (V2 ... V9, Multinomial NB before the linear SVM each time).
remaining_vectorizers = (
    ('V2', unigram_bool_cv_v2),
    ('V3', unigram_cv),
    ('V4', bigram_cv),
    ('V5', bigram_cv_v2),
    ('V6', unigram_tv),
    ('V7', unigram_tv_v2),
    ('V8', bigram_tv),
    ('V9', bigram_tv_v2),
)
for vec_name, vec in remaining_vectorizers:
    for clf_name, classifier in (('mnb', mnb), ('svm', svm)):
        model, score, report = get_model(X, y, y_labels, y_labels, classifier, vec)
        return_features(vec, model)
        df = update_big_df(big_df, {'classifier': clf_name, 'vectorizer': vec_name, 'score': score})
df