## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## PREPROCESSING
## =======================================================
# FIRST - removing anything with 3 or fewer characters
# def my_preprocessor(doc):
# # print('PREPROCESSING!!!!!')
# if len(doc) > 3:
# return(doc)
# else:
# return('none')
def my_preprocessor(doc):
    """Pass the document through unchanged unless it is very short.

    Documents of 2 or fewer characters are replaced with the placeholder
    string 'empty' so downstream vectorizers never see near-empty input.

    NOTE(review): this runs on the *whole* document, not per token — the
    token_pattern on the vectorizers is what filters short words.
    """
    return doc if len(doc) > 2 else 'empty'
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
# Boolean (presence/absence) unigram counts; drop terms in fewer than 5 docs.
unigram_bool_cv_v1 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english')
# Same, but the token_pattern keeps only alphabetic tokens of 2+ letters
# (drops digits and single characters).
unigram_bool_cv_v2 = CountVectorizer(encoding='latin-1', binary=True, min_df=5, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )
# Raw unigram term counts (binary=False), alphabetic tokens only.
unigram_cv = CountVectorizer(encoding='latin-1', binary=False, min_df=5, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b' )
# Unigrams + bigrams, default token pattern.
bigram_cv = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english')
# Unigrams + bigrams restricted to alphabetic tokens.
bigram_cv_v2 = CountVectorizer(encoding='latin-1', ngram_range=(1,2), min_df=5, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
# TF-IDF variants of the above.
unigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english')
unigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
bigram_tv = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english')
bigram_tv_v2 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
## ----- VECTORIZERS with PREPROCESSING
# These route every document through my_preprocessor before tokenizing.
unigram_tv_v3 = TfidfVectorizer(encoding='latin-1', use_idf=True, min_df=5, stop_words='english',
preprocessor=my_preprocessor, token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
bigram_tv_v3 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=5,
preprocessor=my_preprocessor, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
# v4: no min_df cutoff (keeps every term that survives the token pattern).
bigram_tv_v4 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2),
preprocessor=my_preprocessor, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
# v5: looser document-frequency cutoff (min_df=3 instead of 5).
bigram_tv_v5 = TfidfVectorizer(encoding='latin-1', use_idf=True, ngram_range=(1,2), min_df=3,
preprocessor=my_preprocessor, stop_words='english',
token_pattern=r'(?u)\b[a-zA-Z]{2,}\b')
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
# Multinomial Naive Bayes with default smoothing.
mnb = MultinomialNB()
# Linear SVM with default regularization strength.
svm = LinearSVC(C=1)
def get_test_train_vec(X, y, vectorizer):
    """Split the data 60/40 and vectorize both halves.

    The vectorizer is fitted on the training split only, then applied to
    the held-out split — avoiding test-set leakage into the vocabulary.
    Returns (X_train_vec, X_test_vec, y_train, y_test).
    """
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.4, random_state=0)
    train_matrix = vectorizer.fit_transform(X_tr)
    test_matrix = vectorizer.transform(X_te)
    return train_matrix, test_matrix, y_tr, y_te
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit `classifier` and evaluate it on the held-out split.

    Returns (fitted classifier, accuracy score, classification report dict).
    `labels` is accepted for interface compatibility but is not used.
    """
    classifier.fit(X_train_vec, y_train)
    predictions = classifier.predict(X_test_vec)
    report_dict = classification_report(
        y_test, predictions, target_names=target_names, output_dict=True)
    accuracy = classifier.score(X_test_vec, y_test)
    return classifier, accuracy, report_dict
def get_model(X, y, labels, target_names, classifier, vec):
    """Convenience wrapper: split + vectorize, then train and evaluate.

    Returns (fitted model, accuracy score, classification report dict).
    """
    train_vec, test_vec, y_tr, y_te = get_test_train_vec(X, y, vec)
    return run_classifier(train_vec, test_vec, y_tr, y_te,
                          labels, target_names, classifier)
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd
def return_features(vec, model):
    """Print the 10 lowest- and 10 highest-weighted features per class.

    Iterates over model.coef_ (one weight row per class) and shows the
    features at both extremes side by side in a tabulate grid.
    """
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        # FIX: the original built and sorted the same (weight, feature) list
        # twice per class — sort once and slice both ends.
        ranked = sorted(zip(feature_probability, vec.get_feature_names()))
        df1 = pd.DataFrame(ranked[:10])
        df2 = pd.DataFrame(ranked[-10:])
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))
def update_big_df(big_df, new_row):
    """Append `new_row` to the running results list and return it as a
    de-duplicated DataFrame.

    NOTE: deliberately mutates the caller's `big_df` list — that is how the
    script accumulates results across experiment cells.
    """
    big_df.append(new_row)
    frame = pd.DataFrame(big_df)
    return frame.drop_duplicates()
# import pandas as pd
# Load the Kaggle sentiment training data (tab-separated phrases + labels).
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
big_df = []
# NOTE(review): `train` is immediately overwritten with the HW2 dataset here,
# and X/y are re-derived below — the three assignments above look like
# leftovers from an earlier notebook run.
train=pd.read_csv("../HW2/hw7_data_sentiment.csv")
train.head()
# y=train['labels'].values
# X=train['pruned'].values
def remove_na(string):
    """Return `string` if it is a str, otherwise the placeholder "empty".

    Used to replace NaN cells (which pandas reads as float) in the text
    column so the vectorizers always receive strings.
    """
    # isinstance is the idiomatic type check (and handles str subclasses),
    # unlike the original `type(string) == str` comparison.
    if isinstance(string, str):
        return string
    return "empty"
# Replace NaN entries in the 'pruned' text column with the string "empty".
train['pruned_2'] = train.apply(lambda x: remove_na(x['pruned']), axis= 1)
y=train['labels'].values
X=train['pruned_2'].values
big_df = []   # reset the accumulator of (classifier, vectorizer, score) rows
train.head()
# --- Experiment: MNB + unigram TF-IDF with preprocessing (unigram_tv_v3) ---
# NOTE(review): every row below records 'vectorizer': 'V1' even though the
# vectorizer differs per cell — the results table cannot distinguish them.
vec = unigram_tv_v3
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df
# --- Experiment: MNB + bigram TF-IDF with preprocessing (bigram_tv_v3) ---
vec = bigram_tv_v3
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df
# --- Experiment: SVM + bigram_tv_v3 ---
vec = bigram_tv_v3
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
# --- Experiment: SVM + bigram_tv_v4 (no min_df cutoff) ---
vec = bigram_tv_v4
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
# --- Experiment: SVM + bigram_tv_v5 (min_df=3) ---
vec = bigram_tv_v5
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
# classifier vectorizer score
# 0 mnb V1 0.583317
# 1 mnb V1 0.594499
# 2 svm V1 0.629566
# 3 svm V1 0.636662
# vec = unigram_bool_cv_v1
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
# df
# vec = unigram_bool_cv_v1
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
# df
# NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.
# vec = unigram_bool_cv_v2
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
# df
# vec = unigram_bool_cv_v2
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
# df
# vec = unigram_cv
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
# df
# vec = unigram_cv
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
# df
# vec = bigram_cv
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
# df
df
# --- Experiment: MNB then SVM on bigram counts (bigram_cv_v2) ---
vec = bigram_cv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
# NOTE(review): the SVM score below is computed but never recorded — the
# update_big_df call is commented out, so `df` still shows the MNB result.
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})
df
# Disabled experiment cells (V6-V9: plain TF-IDF variants without the custom
# preprocessor). The bare `df` expressions between them are notebook-style
# result displays; in a plain script they are harmless no-ops.
# vec = unigram_tv
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})
df
# vec = unigram_tv_v2
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})
df
# vec = bigram_tv
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})
df
# vec = bigram_tv_v2
# classifier = mnb
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})
# classifier = svm
# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})
df
# Reload the full Kaggle training set for the submission run (replaces the
# HW2 data used in the experiments above).
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
# Vectorizer chosen for the submission, with the scores seen per variant.
# pred_vec = bigram_tv_v3 # 60.4
# pred_vec = bigram_tv_v4 # 60.569
pred_vec = bigram_tv_v4 # removing words < 2 60.584
# Kaggle test set: phrase ids for the submission file, phrases to classify.
test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId']
k_text = test['Phrase']
# k_vec = bigram_tv_v3.transform(k_text)
# k_vec
def get_kaggle_test_train_vec(X, y, vectorizer):
    """Fit the vectorizer on the training portion and return (X_train_vec, y_train).

    With test_size=None, train_test_split falls back to its default split,
    so part of the data is still held out and discarded here.
    """
    X_tr, _X_te, y_tr, _y_te = train_test_split(
        X, y, test_size=None, random_state=0)
    return vectorizer.fit_transform(X_tr), y_tr
def do_the_kaggle(X, y, vec):
    """Train a LinearSVC on the training data and write a Kaggle submission CSV.

    Reads the module-level `k_id` / `k_text` (test-set phrase ids and text).
    Writes 'kaggle_submission_linearSVC_v10.csv' with PhraseId,Sentiment rows.
    """
    X_train_vec, y_train = get_kaggle_test_train_vec(X, y, vec)
    svm_clf = LinearSVC(C=1)
    # BUG FIX: transform the test phrases with the vectorizer that was actually
    # fitted (`vec`) — the original used the module-level `pred_vec`, silently
    # ignoring its own parameter.
    k_vec = vec.transform(k_text)
    print(len(X), X_train_vec.shape, k_vec.shape)
    prediction = svm_clf.fit(X_train_vec, y_train).predict(k_vec)
    # Context manager guarantees the file is closed even if a write fails.
    with open('kaggle_submission_linearSVC_v10.csv', 'w') as outf:
        outf.write('PhraseId,Sentiment\n')
        for phrase_id, label in zip(k_id, prediction):
            outf.write(str(phrase_id) + ',' + str(label) + '\n')
    print('prediction complete')
do_the_kaggle(X,y,pred_vec)