from textblob import TextBlob
from IPython.display import display, HTML
import os
import pandas as pd
import numpy as np
from nltk.sentiment.vader import SentimentIntensityAnalyzer
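# NOTE: VADER needs the 'vader_lexicon' resource; if it is missing, run once:
#   import nltk; nltk.download('vader_lexicon')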
sid = SentimentIntensityAnalyzer()
# Read every file in `path` and return a list of raw document strings.
def get_data_from_files(path):
    results = []
    for file in os.listdir(path):
        with open(os.path.join(path, file)) as f:
            results.append(f.read())
    return results
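# Each corpus directory is assumed to hold one plain-text document per file, e.g.:
#   reviews = get_data_from_files('neg_cornell/')  # -> list of review strings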
# HW1
neg_k = get_data_from_files('AI_NEG/')
pos_k = get_data_from_files('AI_POS/')
neg_a = get_data_from_files('NEG/')
pos_a = get_data_from_files('POS/')
# HW2
neg_cornell = get_data_from_files('neg_cornell/')
pos_cornell = get_data_from_files('pos_cornell/')
# HW3
neg_dirty = get_data_from_files('NEG_dirty/')
pos_dirty = get_data_from_files('POS_dirty/')
neg_joker = get_data_from_files('NEG_JK/')
pos_joker = get_data_from_files('POS_JK/')
# HW4
neg_hw4 = get_data_from_files('neg_hw4/')
pos_hw4 = get_data_from_files('pos_hw4/')
# HW4 (lie detection)
false_lie_hw4 = get_data_from_files('hw4_lie_false/')
true_lie_hw4 = get_data_from_files('hw4_lie_true/')
# TextBlob polarity is a float in [-1.0, 1.0]; treat negative scores as 'neg'.
def get_pn(num):
    return 'neg' if num < 0 else 'pos'

def get_sentiment(array, label):
    blobs = [[TextBlob(text), text] for text in array]
    return [{'label': label,
             'prediction': get_pn(obj.sentiment.polarity),
             'sentiment': obj.sentiment.polarity,
             'length': len(text),
             'excerpt': text[:50]} for obj, text in blobs]
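# The accuracy bookkeeping below repeats verbatim for every corpus. A small
# helper like this (a sketch; `report_accuracy` is not part of the original
# notebook) would condense each pair of blocks into a single call:
def report_accuracy(df_n, df_p):
    for df, name in [(df_n, 'NEG'), (df_p, 'POS')]:
        correct = (df['label'] == df['prediction']).sum()
        print('CORRECT PREDICT %s:' % name, correct, 'out of', len(df), correct / len(df))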
df_n = pd.DataFrame(get_sentiment(neg_k, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_k, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n)
display(df_p)
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(neg_a, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_a, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n)
display(df_p)
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(neg_dirty, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_dirty, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(neg_joker, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_joker, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(neg_hw4, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_hw4, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_sentiment(false_lie_hw4, 'neg'))
df_p = pd.DataFrame(get_sentiment(true_lie_hw4, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
def get_vader_scores(array, label):
    vader_array = []
    for sentence in array:
        ss = sid.polarity_scores(sentence)
        vader_array.append({'label': label,
                            'prediction': get_pn(ss['compound']),
                            'compound': ss['compound'],
                            'excerpt': sentence[:50]})
    return vader_array
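# VADER's 'compound' score is already normalized to [-1, 1]. The usual
# convention treats scores within +/-0.05 of zero as neutral; this notebook
# keeps the simpler rule that any score >= 0 counts as positive.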
df_n = pd.DataFrame(get_vader_scores(neg_k, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_k, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n)
display(df_p)
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(neg_a, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_a, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n)
display(df_p)
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(neg_cornell, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_cornell, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(neg_dirty, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_dirty, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(neg_joker, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_joker, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(neg_hw4, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_hw4, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT NEG:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT POS:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
df_n = pd.DataFrame(get_vader_scores(false_lie_hw4, 'neg'))
df_p = pd.DataFrame(get_vader_scores(true_lie_hw4, 'pos'))
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')
display(df_n[:5])
display(df_p[:5])
sum_correct_n = (df_n['accurate']=='yes').sum()
sum_correct_p = (df_p['accurate']=='yes').sum()
print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))
print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
def get_tokens(sentence):
    # tokenize, then keep only alphabetic tokens, lowercased
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens
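# NOTE: word_tokenize relies on the 'punkt' tokenizer models; if they are
# missing, download them once: import nltk; nltk.download('punkt')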
def get_nltk_train_test(array, label, num_train):
    tokens = [get_tokens(sentence) for sentence in array]
    docs = [(sent, label) for sent in tokens]
    train_docs = docs[:num_train]
    test_docs = docs[num_train:]
    return [train_docs, test_docs]
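# NOTE: the documents are never shuffled, so the first num_train files in each
# directory become the training set and the remainder become the test set.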
def get_nltk_NB(NEG_DATA, POS_DATA, num_train):
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)
    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos
    sentim_analyzer = SentimentAnalyzer()
    # mark_negation appends _NEG to tokens that follow a negation word, e.g.
    # ['not', 'a', 'good', 'movie'] -> ['not', 'a_NEG', 'good_NEG', 'movie_NEG']
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)
    # train a Naive Bayes classifier on the unigram (+negation) features
    classifier = sentim_analyzer.train(NaiveBayesClassifier.train, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))
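# sentim_analyzer.evaluate returns a dict with Accuracy plus per-class
# Precision, Recall, and F-measure; sorting the keys just fixes print order.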
get_nltk_NB(neg_k, pos_k, 4)
get_nltk_NB(neg_a, pos_a, 4)
get_nltk_NB(neg_cornell, pos_cornell, 800)
get_nltk_NB(neg_dirty, pos_dirty, 800)
get_nltk_NB(neg_joker, pos_joker, 86)
get_nltk_NB(neg_hw4, pos_hw4, 32)
get_nltk_NB(false_lie_hw4, true_lie_hw4, 32)