##########################################
# NOTE: I'm toying with the idea of requiring the library just above
# when I use it so it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()
import os
def get_data_from_files(path):
    """Read every entry of the directory *path* and return the file contents.

    Args:
        path: Directory to read. A trailing separator is optional.

    Returns:
        A list with one string per directory entry, in ``os.listdir`` order.
    """
    results = []
    for file_name in os.listdir(path):
        # os.path.join adds the separator only when it is missing, so both
        # '../NEG_JK/' and '../NEG_JK' work; the ``with`` block closes the
        # handle even if reading fails.
        with open(os.path.join(path, file_name)) as handle:
            results.append(handle.read())
    return results
# Load the raw negative / positive review texts and label them.
neg = get_data_from_files('../NEG_JK/')
pos = get_data_from_files('../POS_JK/')
import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
# DataFrame.append was removed in pandas 2.0. pd.concat stacks the frames the
# same way: each frame keeps its own row labels, so labels repeat in all_df.
all_df = pd.concat([neg_df, pos_df])
all_df
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
## Came back and added sentence tokenization for the "Summary experiment"
def get_sentence_tokens(review):
    """Return the result of ``sent_tokenize`` applied to *review*."""
    sentences = sent_tokenize(review)
    return sentences
# Column 0 holds the raw review text. Applying column-wise avoids building a
# row Series per review just to read one field.
all_df['sentences'] = all_df[0].apply(get_sentence_tokens)
all_df['num_sentences'] = all_df['sentences'].apply(len)
def get_tokens(sentence):
    """Word-tokenize *sentence* and return the alphabetic tokens, lower-cased.

    Tokens for which ``str.isalpha`` is false (punctuation, numbers, mixed
    tokens) are dropped.
    """
    return [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]
# Tokens of the raw review text (column 0) and their count, computed
# column-wise instead of row-by-row.
all_df['tokens'] = all_df[0].apply(get_tokens)
all_df['num_tokens'] = all_df['tokens'].apply(len)
all_df
from nltk.corpus import stopwords
# English stop-word list as a set, so membership tests are constant-time.
english_stopwords = stopwords.words("english")
stop_words = set(english_stopwords)
def remove_stopwords(sentence):
    """Return the items of *sentence* that are not in ``stop_words``, in order."""
    return [word for word in sentence if word not in stop_words]
# Stop-word-free tokens and their count, computed column-wise.
all_df['no_sw'] = all_df['tokens'].apply(remove_stopwords)
all_df['num_no_sw'] = all_df['no_sw'].apply(len)
all_df
from nltk.probability import FreqDist
def get_most_common(tokens):
    """Return the 12 most frequent items of *tokens* via ``FreqDist.most_common``."""
    return FreqDist(tokens).most_common(12)


# The same helper serves both columns; the byte-identical second definition
# that used to sit between these two lines was redundant and is removed.
all_df['topwords_unfil'] = all_df['tokens'].apply(get_most_common)
all_df['topwords_fil'] = all_df['no_sw'].apply(get_most_common)
def get_fdist(tokens):
    """Build and return a ``FreqDist`` over *tokens*."""
    fdist = FreqDist(tokens)
    return fdist
# Frequency distributions with ('freq_dist') and without ('freq_dist_unfil')
# stop-word filtering, computed column-wise.
all_df['freq_dist'] = all_df['no_sw'].apply(get_fdist)
all_df['freq_dist_unfil'] = all_df['tokens'].apply(get_fdist)
all_df
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Single shared analyzer instance used by get_vader_score.
sid = SentimentIntensityAnalyzer()


def get_vader_score(review):
    """Return ``sid.polarity_scores(review)`` for the given text."""
    scores = sid.polarity_scores(review)
    return scores
# VADER scores of the full raw review text (column 0), computed column-wise.
all_df['vader_all'] = all_df[0].apply(get_vader_score)
def separate_vader_score(vader_score, key):
    """Look up *key* in the *vader_score* mapping and return its value."""
    component = vader_score[key]
    return component
# Split the 'vader_all' score mappings into one numeric column per component.
# One loop replaces four copy-pasted row-wise applies; column order is unchanged.
for _col, _key in (('v_compound', 'compound'), ('v_neg', 'neg'),
                   ('v_neu', 'neu'), ('v_pos', 'pos')):
    all_df[_col] = all_df['vader_all'].apply(separate_vader_score, args=(_key,))
all_df[:5]
def get_weighted_freq_dist(review, freq_dist):
    """Return a copy of *freq_dist* with every count divided by the largest count.

    The input distribution is left untouched. Previously it was rescaled in
    place, which silently overwrote the caller's raw counts as well.

    Args:
        review: Unused; kept so existing callers keep working.
        freq_dist: Mapping of word -> count that provides ``copy()``
            (e.g. ``FreqDist`` or ``Counter``).

    Returns:
        A mapping of the same type whose values are ``count / max_count``.
        An empty distribution yields an empty copy instead of raising.
    """
    weighted = freq_dist.copy()
    if not weighted:
        return weighted
    max_freq = max(weighted.values())
    for word in weighted:
        weighted[word] = weighted[word] / max_freq
    return weighted
# Per-review word weights; the helper needs two columns, so this stays row-wise.
all_df['weighted_freq_dist'] = all_df.apply(
    lambda row: get_weighted_freq_dist(row['sentences'], row['freq_dist']),
    axis=1,
)
def get_sentence_score(review, freq_dist):
    """Score each sentence by summing the ``freq_dist`` values of its words.

    Args:
        review: Iterable of sentence strings.
        freq_dist: Mapping of word -> weight.

    Returns:
        Dict of sentence -> accumulated weight. Sentences with 30 or more
        space-separated words, and sentences containing no word present in
        *freq_dist*, do not appear in the result.
    """
    sentence_scores = {}
    for sent in review:
        # The length test does not depend on the word, so check it once per
        # sentence rather than once per token.
        if len(sent.split(' ')) >= 30:
            continue
        # Use the explicitly imported word_tokenize. The bare name `nltk` is
        # never imported directly in this file.
        for word in word_tokenize(sent.lower()):
            if word in freq_dist:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + freq_dist[word]
    return sentence_scores
# Score sentences with the normalised weights. Read them from the
# 'weighted_freq_dist' column explicitly rather than relying on 'freq_dist'
# having been rescaled as a side effect of the previous step.
all_df['sentence_scores'] = all_df.apply(
    lambda row: get_sentence_score(row['sentences'], row['weighted_freq_dist']),
    axis=1,
)
def get_summary_sentences(sentence_scores, num_sentences=5):
    """Join the highest-scoring sentences into one summary string.

    Args:
        sentence_scores: Mapping of sentence -> score.
        num_sentences: How many top sentences to keep (default 5).

    Returns:
        The selected sentences, best first, separated by single spaces.
        Ties keep their order in *sentence_scores*. An empty mapping gives ''.
    """
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    # Join with a space: with '' the last word of one sentence fused with the
    # first word of the next ("great.The"), producing tokens that are not words.
    return ' '.join(sent for sent, _score in sorted_sentences[:num_sentences])
# Build the per-review summary, then score it with VADER (column-wise).
all_df['summary_sentences'] = all_df['sentence_scores'].apply(get_summary_sentences)
summaries = all_df['summary_sentences'].tolist()
summaries[3]
all_df['vader_sum_all'] = all_df['summary_sentences'].apply(get_vader_score)
# One numeric column per VADER component of the summary score.
for _col, _key in (('v_compound_sum', 'compound'), ('v_neg_sum', 'neg'),
                   ('v_neu_sum', 'neu'), ('v_pos_sum', 'pos')):
    all_df[_col] = all_df['vader_sum_all'].apply(separate_vader_score, args=(_key,))
def get_freq_words(freq_dist):
    """Return the 50 highest-valued keys of *freq_dist* joined by single spaces.

    Keys are ordered by descending value; equal values keep the mapping's own
    iteration order.
    """
    ranked = sorted(freq_dist, key=freq_dist.get, reverse=True)
    return ' '.join(ranked[:50])
# VADER on the most frequent stop-word-free words of each review (column-wise).
all_df['v_freq_words'] = all_df['freq_dist'].apply(get_freq_words)
all_df['vader_fq_all'] = all_df['v_freq_words'].apply(get_vader_score)
for _col, _key in (('v_compound_fd', 'compound'), ('v_neg_fd', 'neg'),
                   ('v_neu_fd', 'neu'), ('v_pos_fd', 'pos')):
    all_df[_col] = all_df['vader_fq_all'].apply(separate_vader_score, args=(_key,))
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
def get_NB(small_df, labels):
    """Fit a Gaussian Naive Bayes model on a 70/30 split and report its accuracy.

    Args:
        small_df: DataFrame of features; ``small_df.values`` is used.
        labels: Class labels aligned with the rows of *small_df*.

    Returns:
        The accuracy on the held-out 30% (also printed). Earlier the value
        was only printed, so runs could not be compared programmatically.
    """
    from sklearn import metrics

    # Fixed random_state keeps the split, and so the score, reproducible.
    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=0.3, random_state=109)
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    return accuracy
all_df[:5]
# Full-review VADER scores.
small_df = all_df.filter(items=['v_compound', 'v_pos', 'v_neg', 'v_neu'])  # 0.645
get_NB(small_df, all_df['PoN'])
# Summary-sentence VADER scores.
small_df = all_df.filter(items=['v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum'])  # 0.59
get_NB(small_df, all_df['PoN'])
# Summary + full-review scores.
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])  # 0.618
get_NB(small_df, all_df['PoN'])
# Frequent-word scores.
small_df = all_df.filter(items=['v_compound_fd', 'v_pos_fd', 'v_neu_fd', 'v_neg_fd'])  # 0.598
get_NB(small_df, all_df['PoN'])
# `compound` Vader Scores
# (This heading was markdown text fused into the code by the notebook export;
# as bare text it was a NameError / SyntaxError.)
small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615
get_NB(small_df, all_df['PoN'])
# Summary + frequent-word + full-review VADER scores together.
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound_fd', 'v_pos_fd', 'v_neg_fd', 'v_neu_fd',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])  # 0.613
get_NB(small_df, all_df['PoN'])
def get_freq_words(freq_dist):
    """Return the 50 highest-valued keys of *freq_dist* joined by single spaces."""
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word for word, _count in sorted_words[:50])


# VADER on the most frequent words of the *unfiltered* tokens, computed
# column-wise with one loop for the four component columns.
all_df['v_freq_words_unfil'] = all_df['freq_dist_unfil'].apply(get_freq_words)
all_df['vader_fd_all_unfil'] = all_df['v_freq_words_unfil'].apply(get_vader_score)
for _col, _key in (('v_compound_fd_uf', 'compound'), ('v_neg_fd_uf', 'neg'),
                   ('v_neu_fd_uf', 'neu'), ('v_pos_fd_uf', 'pos')):
    all_df[_col] = all_df['vader_fd_all_unfil'].apply(separate_vader_score, args=(_key,))
# Every VADER-derived feature computed so far.
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound_fd', 'v_pos_fd', 'v_neg_fd', 'v_neu_fd',
    'v_compound_fd_uf', 'v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])  # 0.618
get_NB(small_df, all_df['PoN'])
# Unfiltered frequent-word scores only.
small_df = all_df.filter(items=['v_compound_fd_uf', 'v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf'])  # 0.603
get_NB(small_df, all_df['PoN'])
# Split the reviews by label and pull out their summaries for inspection.
is_positive = all_df['PoN'] == 'P'
is_negative = all_df['PoN'] == 'N'
summaries_pos = all_df[is_positive]
summaries_neg = all_df[is_negative]
summaries_pos_list = summaries_pos['summary_sentences'].tolist()
summaries_neg_list = summaries_neg['summary_sentences'].tolist()
summaries_pos_list[:1]
summaries_neg_list[:1]
summaries_neg_list[:1]
### VERSION 1
# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# training_set = sentim_analyzer.apply_features(training_docs)
# test_set = sentim_analyzer.apply_features(testing_docs)
sentim_analyzer = SentimentAnalyzer()


def get_nltk_negs(tokens):
    """Pass *tokens* through ``mark_negation`` and ``sentim_analyzer.all_words``.

    The marked token list is wrapped in a one-element list before being
    handed to ``all_words``.
    """
    negated_doc = mark_negation(tokens)
    return sentim_analyzer.all_words([negated_doc])
def get_unigram_feats(neg_tokens):
    """Return ``sentim_analyzer.unigram_word_feats(neg_tokens)``."""
    return sentim_analyzer.unigram_word_feats(neg_tokens)
def get_bigram_feats(tokens):
    """Return each pair of adjacent tokens joined with an underscore.

    Fewer than two tokens yields an empty list.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    return ["_".join(pair) for pair in adjacent_pairs]
# Negation-marked tokens, unigram features and bigram features (plain and
# negation-marked), each derived column-wise from a single source column.
all_df['nltk_negs'] = all_df['tokens'].apply(get_nltk_negs)
all_df['unigram_feats'] = all_df['nltk_negs'].apply(get_unigram_feats)
all_df['bigram_feats'] = all_df['tokens'].apply(get_bigram_feats)
all_df['bigram_feats_neg'] = all_df['nltk_negs'].apply(get_bigram_feats)
all_df[:5]
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
### VERSION 2
# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
# training_set = sentim_analyzer.apply_features(training_docs)
# test_set = sentim_analyzer.apply_features(testing_docs)
# Fresh analyzer for the VERSION 2 experiment. This rebinds the module-level
# name, so helpers that read `sentim_analyzer` use this instance from here on.
sentim_analyzer = SentimentAnalyzer()
def get_nltk_data(tokens):
    """Run one review's tokens through the module-level ``sentim_analyzer``.

    Steps: mark negation, collect all words, derive unigram features,
    register ``extract_unigram_feats`` with those unigrams, then return
    ``sentim_analyzer.apply_features(tokens)``.

    NOTE(review): ``add_feat_extractor`` is called on the shared analyzer on
    every invocation, so extractors presumably accumulate across reviews.
    Confirm against the NLTK SentimentAnalyzer documentation.
    NOTE(review): ``apply_features`` receives the flat token list rather than
    a list of documents. Verify this is what was intended.
    """
    neg_tokens = sentim_analyzer.all_words([mark_negation(tokens)])
    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    return sentim_analyzer.apply_features(tokens)
# def get_unigram_feats(neg_tokens):
# return unigram_feats
# Keep the VERSION 2 output in its own frame, separate from all_df.
nltk_df = pd.DataFrame()
nltk_df['nltk_data'] = all_df.apply(
    lambda row: get_nltk_data(row['tokens']),
    axis=1,
)
# all_df['nltk']
# all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
# Constant-zero column on all_df.
all_df['nltk_all'] = 0
all_df['nltk_all']
all_df[:3]
from nltk.tokenize import casual_tokenize
from collections import Counter
# Bag of words straight from the raw review text (column 0), computed
# column-wise: casual_tokenize followed by Counter.
all_df['bow_nosw'] = all_df[0].apply(lambda text: Counter(casual_tokenize(text)))
all_df[:3]
# Keeping punctuation
# def diy_cleaner(review):
# both = review.split('\n')
# title = both[0]
# review = both[1]
# cleaned = title + '.' + title + '.' + review
# return cleaned.lower()
# Removing punctuation
# def diy_cleaner(review):
# both = review.split('\n')
# title = both[0]
# review = both[1]
# review = review.replace("\'",'')
# review = review.replace("'",'')
# review = review.replace(",",'')
# cleaned = title + ' ' + title + ' ' + ' '.join(review.split('.'))
# return cleaned.lower()
import re, string
# Runs of non-word characters and underscores. Compiled once, as a raw string
# so that `\W` is not treated as a (deprecated) string escape.
_NON_WORD_RE = re.compile(r'[\W_]+')


def diy_cleaner(review):
    """Build a lower-cased cleaned text from a "title\\nbody" review string.

    The first line is the title and the second line is the body; any further
    lines are ignored. Apostrophes are removed from the body and every run of
    non-word characters or underscores becomes a single space. The title is
    included twice, unmodified apart from lower-casing.

    Args:
        review: Raw review text.

    Returns:
        ``title + ' ' + title + ' ' + cleaned_body``, lower-cased. A review
        without a newline is treated as having an empty body instead of
        raising IndexError.
    """
    lines = review.split('\n')
    title = lines[0]
    body = lines[1] if len(lines) > 1 else ''
    body = body.replace("'", "")
    body = _NON_WORD_RE.sub(' ', body)
    cleaned = title + ' ' + title + ' ' + body
    return cleaned.lower()
# Cleaned text of the raw review (column 0), computed column-wise.
all_df['diy_cleaner'] = all_df[0].apply(diy_cleaner)
# Row label 0 is looked up by label here. `.tolist()` only works if the lookup
# returns a Series, which presumably relies on the labels repeating between
# the negative and positive frames (verify if the index is ever reset).
all_df['diy_cleaner'][0].tolist()
# casual_tokenize and (2) Counter
# (This heading was markdown text fused into the code by the notebook export;
# as bare text it made the file unparseable.)
all_df['bow_v2'] = all_df['diy_cleaner'].apply(lambda text: Counter(casual_tokenize(text)))
all_df[:3]
def get_bow_from_column(df, column):
    """Count whitespace-separated words across every string in ``df[column]``.

    Returns a ``Counter`` of word -> occurrences over the whole column.
    """
    bag = Counter()
    for text in df[column].tolist():
        bag.update(text.split())
    return bag
# Corpus-wide word counts of the cleaned text: overall, negative, positive.
negative_rows = all_df[all_df['PoN'] == 'N']
positive_rows = all_df[all_df['PoN'] == 'P']
big_bow = get_bow_from_column(all_df, 'diy_cleaner')
big_bow_n = get_bow_from_column(negative_rows, 'diy_cleaner')
big_bow_p = get_bow_from_column(positive_rows, 'diy_cleaner')
big_bow_n.most_common(10)
big_bow_p.most_common(10)
# Wow, this is unhelpful. Removing words of 3 characters or fewer, like Professor Gates does!
def pruner(review):
    """Keep only the whitespace-separated words longer than three characters.

    The surviving words are returned joined by single spaces.
    """
    return ' '.join(word for word in review.split() if len(word) > 3)
# Pruned text (short words removed), computed column-wise, followed by the
# corpus-wide word counts: overall, negative, positive.
all_df['pruned'] = all_df['diy_cleaner'].apply(pruner)
big_bow = get_bow_from_column(all_df, 'pruned')
big_bow_n = get_bow_from_column(all_df[all_df['PoN'] == 'N'], 'pruned')
big_bow_p = get_bow_from_column(all_df[all_df['PoN'] == 'P'], 'pruned')
big_bow.most_common(10)
big_bow_n.most_common(10)
big_bow_p.most_common(10)
# bow_df = all_df['pruned']
# get_NB(small_df, all_df['PoN'])
all_df['pruned']
# Each experiment below turns one bag-of-words column into a document-term
# frame indexed by the 'PoN' label (missing words filled with 0) and runs
# Naive Bayes on it. The bag columns are built column-wise.

# v3: pruned text.
all_df['bow_v3'] = all_df['pruned'].apply(lambda text: Counter(casual_tokenize(text)))
new_df = pd.DataFrame(all_df['bow_v3'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)

# v2: cleaned text.
new_df = pd.DataFrame(all_df['bow_v2'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)

# Raw text.
new_df = pd.DataFrame(all_df['bow_nosw'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)

# v4: negation-marked tokens.
all_df['bow_v4'] = all_df['nltk_negs'].apply(
    lambda words: Counter(casual_tokenize(' '.join(words))))
new_df = pd.DataFrame(all_df['bow_v4'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)

# v5: bigram features.
all_df['bow_v5'] = all_df['bigram_feats'].apply(
    lambda words: Counter(casual_tokenize(' '.join(words))))
new_df = pd.DataFrame(all_df['bow_v5'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)
# Flatten the per-review bigram lists and count them across the corpus.
all_bigrams = [bigram for review in all_df['bigram_feats'] for bigram in review]
bigram_count = Counter(all_bigrams)
bigram_count.most_common(10)
# What can we learn from the intersection of "positive" and "negative" words? Essentially creating a new "stopword" list of words that frequently occur in both lists.
# Word counts of the pruned text, then the 100 most common words per label.
negative_rows = all_df[all_df['PoN'] == 'N']
positive_rows = all_df[all_df['PoN'] == 'P']
big_bow = get_bow_from_column(all_df, 'pruned')
big_bow_n = get_bow_from_column(negative_rows, 'pruned')
big_bow_p = get_bow_from_column(positive_rows, 'pruned')
most_common_neg = [word for word, _count in big_bow_n.most_common(100)]
most_common_neg[:10]
most_common_pos = [word for word, _count in big_bow_p.most_common(100)]
most_common_pos[:10]
import numpy as np
# array1 = most common negative words, array2 = most common positive words.
print("Unique values in array1 that are not in array2:")
neg_notpos = np.setdiff1d(ar1=most_common_neg, ar2=most_common_pos)
neg_notpos

print("Unique values in array2 that are not in array1:")
pos_notneg = np.setdiff1d(ar1=most_common_pos, ar2=most_common_neg)
pos_notneg

print("Common values between two arrays:")
in_both = np.intersect1d(ar1=most_common_neg, ar2=most_common_pos)
print(len(in_both))
in_both
def get_common_words(num):
    """Compare the *num* most common negative and positive words.

    Reads the module-level counters ``big_bow_n`` and ``big_bow_p``.

    Returns:
        A 7-item list:
        [0] number of shared words, [1] number of negative-only words,
        [2] number of positive-only words, [3] shared count divided by *num*,
        [4] shared words, [5] negative-only words, [6] positive-only words
        (items 4-6 are the arrays returned by NumPy's set routines).
    """
    top_neg = [word for word, _count in big_bow_n.most_common(num)]
    top_pos = [word for word, _count in big_bow_p.most_common(num)]
    shared = np.intersect1d(top_neg, top_pos)
    only_neg = np.setdiff1d(top_neg, top_pos)
    only_pos = np.setdiff1d(top_pos, top_neg)
    return [
        len(shared),
        len(only_neg),
        len(only_pos),
        len(shared) / num,
        shared,
        only_neg,
        only_pos,
    ]
# Overlap statistics for several "top N" cut-offs; [:4] shows just the counts
# and the shared ratio.
common_100, common_200, common_300, common_500, common_1000 = (
    get_common_words(top_n) for top_n in (100, 200, 300, 500, 1000)
)
common_100[:4]
common_200[:4]
common_300[:4]
common_500[:4]
common_1000[:4]
def get_only_polarized(tokens):
    """Return *tokens* without the words shared by both labels.

    The shared-word list is ``common_500[4]`` (module-level). The trailing
    numbers on the alternatives below are the author's recorded scores.
    """
    # return [token for token in tokens if token not in common_1000[4]] # 66
    # return [token for token in tokens if token not in common_100[4]] # 70
    # Build a set once per call: testing membership against the array itself
    # scans every shared word for every token.
    shared_words = set(common_500[4])
    return [token for token in tokens if token not in shared_words] # 70
# v6: tokens with the shared words removed -> bag of words -> Naive Bayes.
# Derived columns are computed column-wise.
all_df['no_shared_words'] = all_df['tokens'].apply(get_only_polarized)
all_df[:5]
all_df['bow_v6'] = all_df['no_shared_words'].apply(
    lambda words: Counter(casual_tokenize(' '.join(words))))
new_df = pd.DataFrame(all_df['bow_v6'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)
def get_only_polarized_v2(tokens):
    """Return *tokens* without the words listed in ``common_1000[6]``.

    ``common_1000`` is module-level. The trailing numbers on the alternatives
    below are the author's recorded scores.
    """
    # return [token for token in tokens if token in common_1000[5]] # 54
    # return [token for token in tokens if token not in common_1000[5]] # 59
    # return [token for token in tokens if token not in common_1000[6]] # 60
    # Build a set once per call instead of scanning the array for each token.
    excluded_words = set(common_1000[6])
    return [token for token in tokens if token not in excluded_words] # 60
# v7: tokens filtered by get_only_polarized_v2 -> bag of words -> Naive Bayes.
# Derived columns are computed column-wise.
all_df['no_neg_words'] = all_df['tokens'].apply(get_only_polarized_v2)
all_df['bow_v7'] = all_df['no_neg_words'].apply(
    lambda words: Counter(casual_tokenize(' '.join(words))))
new_df = pd.DataFrame(all_df['bow_v7'].tolist(), index=all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)
all_df
big_bow_n.keys()
import wordcloud
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import seaborn as sns
import matplotlib.pyplot as plt
def create_word_cloud_with_mask(path_of_mask_image, dictionary,
                                max_num_words, title):
    """Plot a word cloud of *dictionary* using an image as mask and colour source.

    Args:
        path_of_mask_image: Path of the image opened as the mask.
        dictionary: Frequencies passed to ``generate_from_frequencies``.
        max_num_words: Upper bound passed to ``WordCloud(max_words=...)``.
        title: Title of the figure.

    Returns:
        The ``matplotlib.pyplot`` module, after drawing on a new 8x8 figure.
    """
    mask_pixels = np.array(Image.open(path_of_mask_image))

    # Build the cloud; the fixed random_state keeps the layout repeatable.
    cloud = WordCloud(
        background_color="white",
        max_words=max_num_words,
        mask=mask_pixels,
        max_font_size=125,
        random_state=1006,
    )
    cloud.generate_from_frequencies(dictionary)

    # The colour function is derived from the same mask image.
    palette = ImageColorGenerator(mask_pixels)

    plt.figure(figsize=[8, 8])
    recoloured = cloud.recolor(color_func=palette)
    plt.imshow(recoloured, interpolation="bilinear")
    plt.title(title)
    sns.set_context("poster")
    plt.axis("off")
    return plt
# import sys
# print(sys.executable)
# mask = np.array(Image.open('../questionmark.png'))
# mask
# Word cloud of the negative-review word counts, using the thumbs-up mask.
create_word_cloud_with_mask(
    path_of_mask_image='thumbup.png',
    dictionary=big_bow_n,
    max_num_words=750,
    title="Pre-clean",
)
big_bow_n