# VECTORIZATION (Pandas style!)
##########################################
# NOTE: I'm toying with the idea of requiring the library just above
# when I use it so it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()
import os
def get_data_from_files(path):
    """Read every file in *path* and return their contents as a list of strings.

    Files come back in os.listdir() order (arbitrary, platform-dependent).
    Uses os.path.join so the path works with or without a trailing separator,
    and a context manager so file handles are closed even if read() raises
    (the original leaked the handle on error).
    """
    results = []
    for filename in os.listdir(path):
        with open(os.path.join(path, filename)) as f:
            results.append(f.read())
    return results
# Earlier experiments loaded the Cornell movie-review and hw4 review corpora:
# neg = get_data_from_files('../neg_cornell/')
# pos = get_data_from_files('../pos_cornell/')
# neg = get_data_from_files('../neg_hw4/')
# pos = get_data_from_files('../pos_hw4/')
# Current run: the hw4 deception corpus.
# NOTE(review): 'lie_false' is mapped to the positive class and 'lie_true' to
# the negative class — confirm this labeling is intentional.
pos = get_data_from_files('../hw4_lie_false/')
neg = get_data_from_files('../hw4_lie_true/')
import pandas as pd
# Build one labeled frame: column 0 holds the raw text, 'PoN' the class label.
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent (row labels repeat, as before).
all_df = pd.concat([neg_df, pos_df])
all_df[:3]
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
## Came back and added sentence tokenization for the "Summary experiment"
def get_sentence_tokens(review):
    """Split one review into a list of sentence strings via nltk sent_tokenize."""
    sentences = sent_tokenize(review)
    return sentences
# Column 0 is the raw review text; store per-review sentence lists and counts.
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)
def get_tokens(sentence):
    """Word-tokenize text, keeping only purely-alphabetic tokens, lowercased."""
    clean_tokens = []
    for token in word_tokenize(sentence):
        if token.isalpha():
            clean_tokens.append(token.lower())
    return clean_tokens
# Word-level tokens of the raw text (column 0) and their counts.
all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)
all_df[:3]
from nltk.corpus import stopwords
# English stopword set used by remove_stopwords below.
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    """Return the token list with English stopwords removed (order preserved)."""
    return [word for word in sentence if word not in stop_words]
# Stopword-filtered tokens and their counts.
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)
all_df[:5]
from nltk.probability import FreqDist
def get_most_common(tokens):
    """Return the 12 most frequent (token, count) pairs for a token list."""
    return FreqDist(tokens).most_common(12)
# Top-12 words over the UNfiltered tokens (stopwords included).
all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)
# NOTE: the notebook redefined get_most_common here with an identical body;
# the duplicate definition has been removed — the version above is used.
all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)
def get_fdist(tokens):
    """Build an nltk FreqDist (token -> count) over the given tokens."""
    distribution = FreqDist(tokens)
    return distribution
# Frequency distributions for filtered (no stopwords) and unfiltered tokens.
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)
all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)
all_df[:3]
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared VADER analyzer instance used by get_vader_score below.
sid = SentimentIntensityAnalyzer()
def get_vader_score(review):
    """Return VADER polarity scores ('neg'/'neu'/'pos'/'compound') for a text."""
    scores = sid.polarity_scores(review)
    return scores
# VADER scores computed over the full raw text (column 0).
all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)
def separate_vader_score(vader_score, key):
    """Extract one component (e.g. 'compound', 'neg') from a VADER score dict."""
    value = vader_score[key]
    return value
# Split the VADER dict into one numeric column per component.
all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)
all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)
all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)
all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)
# Spot check: column 0 (raw text), row label 17.  NOTE(review): row labels
# repeat after the neg/pos frames were stacked, so this may select two rows.
all_df[0][17]
def get_weighted_freq_dist(review, freq_dist):
    """Normalize freq_dist values in place to [0, 1] by the maximum frequency.

    `review` is accepted for apply() symmetry but is not used.  Mutates and
    returns `freq_dist`; returns the string sentinel 'nope' when the
    distribution is empty (max() of no values) — preserved from the original.
    The original used a bare `except` that swallowed every error; this narrows
    it to the failure modes the code can actually produce.
    """
    try:
        max_freq = max(freq_dist.values())
        for word in freq_dist.keys():
            freq_dist[word] = (freq_dist[word]/max_freq)
        return freq_dist
    except (ValueError, ZeroDivisionError):
        # ValueError: empty distribution; ZeroDivisionError: max count of 0.
        return 'nope'
# NOTE: this mutates the FreqDist objects in the 'freq_dist' column in place —
# after this line their values are weights in [0, 1], not raw counts.
all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)
def get_sentence_score(review, freq_dist):
    """Score each sentence of `review` by summing freq_dist weights of its words.

    Only words present in freq_dist contribute, and only sentences shorter
    than 30 space-separated words are scored.  Returns {sentence: score}.
    Fix: the original called `nltk.word_tokenize`, relying on the `nltk`
    name leaking into scope through `from nltk.sentiment.util import *`;
    this uses the explicitly imported word_tokenize instead.
    """
    sentence_scores = {}
    for sent in review:
        for word in word_tokenize(sent.lower()):
            if word in freq_dist:
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores:
                        sentence_scores[sent] = freq_dist[word]
                    else:
                        sentence_scores[sent] += freq_dist[word]
    return sentence_scores
# Per-review sentence scores based on the (now weighted) freq_dist column.
all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)
def get_summary_sentences(sentence_scores):
    """Concatenate the five highest-scoring sentences, highest first.

    Note: sentences are joined with no separator, matching the original.
    """
    ranked = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    top_five = ranked[:5]
    return ''.join(sentence for sentence, _ in top_five)
# Extractive summaries, then VADER re-scored on the summary instead of full text.
all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)
summaries = all_df['summary_sentences'].tolist()
# Spot-check one summary.
summaries[3]
all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)
all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)
all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)
all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)
all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)
def get_freq_words(freq_dist):
    """Return the 50 highest-valued words, space-separated, highest first."""
    by_count = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    top_fifty = by_count[:50]
    return ' '.join(word for word, _ in top_fifty)
# VADER over the 50 top-weighted (stopword-filtered) words per review.
# NOTE(review): get_weighted_freq_dist mutated these FreqDist objects in place
# earlier, so values are weights rather than raw counts (ordering unaffected).
all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)
all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)
all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)
all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)
all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)
all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics
def get_NB(small_df, labels, no_negs):
    """Train/test Naive Bayes on a feature DataFrame and print accuracy.

    small_df : DataFrame of numeric features (its .values matrix is used)
    labels   : class labels aligned with small_df rows
    no_negs  : when True, also fit MultinomialNB (which requires
               non-negative features)

    Improvement over the original: the accuracies are now also *returned*
    (GNB accuracy, or a (GNB, MNNB) tuple when no_negs) instead of being
    print-only — callers that ignored the old None return are unaffected.
    """
    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=0.3, random_state=109)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    gnb_acc = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy GNB:", gnb_acc)
    if no_negs:
        mnnb = MultinomialNB()
        mnnb.fit(x_train, y_train)
        y_pred_mn = mnnb.predict(x_test)
        mnnb_acc = metrics.accuracy_score(y_test, y_pred_mn)
        print("Accuracy MNNB:", mnnb_acc)
        return gnb_acc, mnnb_acc
    return gnb_acc
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()
# clf.fit(x_train, y_train)
# print(clf.predict(x_train[2:3]))
# Feature-set experiments: each filter() selects a subset of the VADER-derived
# columns, then get_NB reports Naive Bayes accuracy.  Trailing numbers are
# accuracies observed on an earlier run.  MultinomialNB (third arg True) is
# only attempted on feature sets with no negative values (pos/neu only).
small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'], False)
small_df = all_df.filter(['v_pos', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'], True)
# Same experiments on the summary-based scores.
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'], False)
small_df = all_df.filter(['v_pos_sum','v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'], True)
# Full-text and summary features combined.
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], False)
small_df = all_df.filter(['v_pos_sum', 'v_neu_sum', 'v_pos', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], True)
# Frequent-words-based VADER features.
small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598
get_NB(small_df, all_df['PoN'], False)
small_df = all_df.filter(['v_pos_fd', 'v_neu_fd']) # 0.598
get_NB(small_df, all_df['PoN'], True)
# Compound-only Vader scores
small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615
get_NB(small_df, all_df['PoN'], False)
# pos-only counterpart of the compound-only experiment.
small_df = all_df.filter(['v_pos_fd','v_pos_sum', 'v_pos']) # 0.615
get_NB(small_df, all_df['PoN'], True)
# All twelve VADER features (full text + summary + freq-dist words) together.
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum',
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613
get_NB(small_df, all_df['PoN'], False)
# NOTE: the notebook redefined get_freq_words here with an identical body;
# the duplicate definition has been removed — the version above is used.
all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)
# VADER over the top words of the UNfiltered (stopwords kept) distributions.
all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)
all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)
all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)
all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)
all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)
# All 16 VADER-derived features together, then the unfiltered set alone.
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum',
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd',
                          'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], False)
small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603
get_NB(small_df, all_df['PoN'], False)
# Eyeball one summary from each class.
summaries_pos = all_df[all_df['PoN'] == 'P']
summaries_neg = all_df[all_df['PoN'] == 'N']
summaries_pos_list = summaries_pos['summary_sentences'].tolist()
summaries_neg_list = summaries_neg['summary_sentences'].tolist()
summaries_pos_list[:1]
summaries_neg_list[:1]
### VERSION 1
# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# training_set = sentim_analyzer.apply_features(training_docs)
# test_set = sentim_analyzer.apply_features(testing_docs)
# Shared analyzer for the "version 1" negation/unigram-feature experiment.
sentim_analyzer = SentimentAnalyzer()
def get_nltk_negs(tokens):
    """Run nltk's mark_negation over the token list and collect the words."""
    marked = mark_negation(tokens)
    return sentim_analyzer.all_words([marked])
def get_unigram_feats(neg_tokens):
    """Derive unigram word features from negation-marked tokens."""
    return sentim_analyzer.unigram_word_feats(neg_tokens)
# Negation-marked tokens per review and the unigram features derived from them.
all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)
all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
### VERSION 2
# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# training_set = sentim_analyzer.apply_features(training_docs)
# test_set = sentim_analyzer.apply_features(testing_docs)
# Fresh analyzer for "version 2" — presumably to reset any feature-extractor
# state accumulated above; confirm if that was the intent.
sentim_analyzer = SentimentAnalyzer()
def get_nltk_data(tokens):
    """Mark negations, derive unigram features, register the feature extractor
    on the shared analyzer, and apply the features to the same tokens."""
    negated_words = sentim_analyzer.all_words([mark_negation(tokens)])
    feats = sentim_analyzer.unigram_word_feats(negated_words)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=feats)
    return sentim_analyzer.apply_features(tokens)
# def get_unigram_feats(neg_tokens):
# return unigram_feats
# Collect per-review nltk feature sets in a separate frame.
# NOTE(review): get_nltk_data also registers a new feature extractor on the
# shared analyzer for every row — likely a side effect worth revisiting.
nltk_df = pd.DataFrame()
nltk_df['nltk_data'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
# all_df['nltk']
# all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
# all_df['nltk_all'] = 0
nltk_df
all_df['nltk_negs']
from nltk.tokenize import casual_tokenize
from collections import Counter
# Bag-of-words via casual_tokenize + Counter over the raw text (column 0).
# NOTE(review): the column name says 'nosw' but stopwords are NOT removed
# here — the Counter runs on the full tokenized text.
all_df['bow_nosw'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)
all_df[:3]