## =======================================================
## TOKENIZING
## =======================================================
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
## =======================================================
## VECTORIZING
## =======================================================
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
## ----- VECTORIZORS
# Settings shared by every vectorizer: latin-1 decoding, drop terms appearing in
# fewer than 5 documents, remove English stop words.
_SHARED = dict(encoding='latin-1', min_df=5, stop_words='english')
# Token pattern restricting features to purely alphabetic words of length >= 2.
_ALPHA_ONLY = r'(?u)\b[a-zA-Z]{2,}\b'
unigram_bool_cv_v1 = CountVectorizer(binary=True, **_SHARED)
unigram_bool_cv_v2 = CountVectorizer(binary=True, token_pattern=_ALPHA_ONLY, **_SHARED)
unigram_cv = CountVectorizer(binary=False, token_pattern=_ALPHA_ONLY, **_SHARED)
bigram_cv = CountVectorizer(ngram_range=(1, 2), **_SHARED)
bigram_cv_v2 = CountVectorizer(ngram_range=(1, 2), token_pattern=_ALPHA_ONLY, **_SHARED)
unigram_tv = TfidfVectorizer(use_idf=True, **_SHARED)
unigram_tv_v2 = TfidfVectorizer(use_idf=True, token_pattern=_ALPHA_ONLY, **_SHARED)
bigram_tv = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), **_SHARED)
bigram_tv_v2 = TfidfVectorizer(use_idf=True, ngram_range=(1, 2), token_pattern=_ALPHA_ONLY, **_SHARED)
## =======================================================
## MODELING
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import BernoulliNB, MultinomialNB
## ----- CLASSIFIERS
# The two classifiers compared in every experiment below.
mnb = MultinomialNB()  # multinomial Naive Bayes with default hyperparameters
svm = LinearSVC(C=1)   # linear SVM; C=1 is spelled out explicitly
def get_test_train_vec(X, y, vectorizer):
    """Split X/y into 60% train / 40% test (fixed seed) and vectorize both.

    The vocabulary is fitted on the training text only; the held-out text is
    transformed with that fitted vocabulary.
    """
    split = train_test_split(X, y, test_size=0.4, random_state=0)
    X_train, X_test, y_train, y_test = split
    train_vec = vectorizer.fit_transform(X_train)
    test_vec = vectorizer.transform(X_test)
    return train_vec, test_vec, y_train, y_test
def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):
    """Fit `classifier` on the training vectors and evaluate on the test vectors.

    Returns the fitted classifier, its accuracy on the test set, and a
    per-class classification report as a dict. `labels` is accepted for
    interface compatibility but is not used here.
    """
    clf = classifier
    clf.fit(X_train_vec, y_train)
    predictions = clf.predict(X_test_vec)
    report = classification_report(
        y_test, predictions, target_names=target_names, output_dict=True)
    accuracy = clf.score(X_test_vec, y_test)
    return clf, accuracy, report
def get_model(X, y, labels, target_names, classifier, vec):
    """Convenience wrapper: split + vectorize, then train and score `classifier`.

    Returns (fitted model, accuracy score, per-class report dict).
    """
    vectors = get_test_train_vec(X, y, vec)
    return run_classifier(*vectors, labels, target_names, classifier)
## =======================================================
## VISUALIZING
## =======================================================
from tabulate import tabulate
import pandas as pd
def return_features(vec, model):
    """Print, for each class of a fitted linear model, its 10 lowest- and
    10 highest-weighted features as a side-by-side table.

    `vec` must be the (fitted) vectorizer whose vocabulary produced the
    columns of `model.coef_`.
    """
    # NOTE(review): get_feature_names() was removed in scikit-learn >= 1.0
    # (replaced by get_feature_names_out()) — confirm the pinned version.
    feature_names = vec.get_feature_names()
    for i, feature_probability in enumerate(model.coef_):
        print('============ Sentiment Score: ', i)
        # Sort the (weight, feature) pairs once per class instead of twice.
        ranked = sorted(zip(feature_probability, feature_names))
        df1 = pd.DataFrame(ranked[:10])    # 10 smallest weights
        df2 = pd.DataFrame(ranked[-10:])   # 10 largest weights
        df3 = pd.concat([df1, df2], axis=1)
        print(tabulate(df3, tablefmt="fancy_grid", headers=["Most","Likely","Least","Likely"], floatfmt=".2f"))
def update_big_df(big_df, new_row):
    """Append `new_row` (a dict) to the accumulator list `big_df` in place,
    then return the accumulated rows as a de-duplicated DataFrame.
    """
    big_df.append(new_row)
    return pd.DataFrame(big_df).drop_duplicates()
# import pandas as pd
# Load the Kaggle sentiment training data (tab-separated; uses the
# 'Sentiment' and 'Phrase' columns).
train=pd.read_csv("kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values  # sentiment labels 0-4
X=train['Phrase'].values     # raw phrase text
# Accumulator: one {classifier, vectorizer, score} row per experiment below.
big_df = []
## --- V1: boolean unigram counts (unigram_bool_cv_v1), MNB then SVM ---
vec = unigram_bool_cv_v1
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})
df  # bare expression: displays the frame in a notebook; no effect as a script
vec = unigram_bool_cv_v1
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})
df
# NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not.
## --- V2: boolean unigrams, alphabetic-only tokens (unigram_bool_cv_v2) ---
vec = unigram_bool_cv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})
df
# return_features(vec, model)
vec = unigram_bool_cv_v2
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
# return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})
df
## --- V3: unigram counts, alphabetic-only tokens (unigram_cv) ---
vec = unigram_cv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})
df
vec = unigram_cv
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})
df
## --- V4: unigram+bigram counts (bigram_cv) ---
vec = bigram_cv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})
vec = bigram_cv
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})
df
## --- V5: unigram+bigram counts, alphabetic-only tokens (bigram_cv_v2) ---
vec = bigram_cv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})
df
## --- V6: unigram tf-idf (unigram_tv) ---
vec = unigram_tv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})
df
## --- V7: unigram tf-idf, alphabetic-only tokens (unigram_tv_v2) ---
vec = unigram_tv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})
df
## --- V8: unigram+bigram tf-idf (bigram_tv) ---
vec = bigram_tv
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})
df
## --- V9: unigram+bigram tf-idf, alphabetic-only tokens (bigram_tv_v2) ---
## The only experiment that also prints the top/bottom features per class.
vec = bigram_tv_v2
classifier = mnb
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})
classifier = svm
model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)
return_features(vec, model)
df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})
df
# NOTE(review): pred_vec is assigned but never read below.
pred_vec = bigram_cv_v2
# Load the Kaggle test set to be classified for submission.
test = pd.read_csv("kaggle-sentiment/test.tsv", delimiter='\t')
k_id = test['PhraseId'].values    # submission row ids
k_text = test['Phrase'].values    # phrases to classify
# NOTE(review): this transform uses whatever fit bigram_cv_v2 last received
# (the 60/40 split of the V5 experiment above). do_the_kaggle() later re-fits
# the same vectorizer on a different split, so k_vec's feature space may not
# match that refit — verify before predicting with it.
k_vec = bigram_cv_v2.transform(k_text)
k_vec
def get_kaggle_test_train_vec(X, y, vectorizer):
    """Split X/y with the library's default test fraction and vectorize both.

    Identical to get_test_train_vec except test_size=None, which falls back
    to scikit-learn's default split fraction (version-dependent — confirm).
    The vocabulary is fitted on the training portion only.
    """
    split = train_test_split(X, y, test_size=None, random_state=0)
    X_train, X_test, y_train, y_test = split
    return (vectorizer.fit_transform(X_train),
            vectorizer.transform(X_test),
            y_train,
            y_test)
def do_the_kaggle(X, y, vec):
    """Fit a LinearSVC on a fresh train split, predict the Kaggle test set,
    and write 'kaggle_submission_linearSVC_v5.csv' (PhraseId,Sentiment rows).

    Reads the module-level globals `k_id` (test PhraseIds) and `k_text`
    (test phrases) loaded earlier in this file.
    """
    X_train_vec, X_test_vec, y_train, y_test = get_kaggle_test_train_vec(X, y, vec)
    svm_clf = LinearSVC(C=1)
    svm_clf.fit(X_train_vec, y_train)
    # BUG FIX: transform the Kaggle text with the vectorizer as *just* fitted
    # above. The module-level k_vec was built from an earlier fit on a
    # different split, so its feature space need not match this classifier.
    prediction = svm_clf.predict(vec.transform(k_text))
    # Context manager guarantees the file is closed even if a write fails.
    with open('kaggle_submission_linearSVC_v5.csv', 'w') as outf:
        outf.write('PhraseId,Sentiment\n')
        for phrase_id, label in zip(k_id, prediction):
            outf.write(str(phrase_id) + ',' + str(label) + '\n')
    print('prediction complete')
# Build and write the Kaggle submission using the V5 vectorizer.
do_the_kaggle(X,y,bigram_cv_v2)
df  # final results table (visible only in a notebook)