Sentiment Analysis: TextBlob + VADER¶

via this tutorial |10-6-19

from textblob import TextBlob
from IPython.display import display, HTML
import os
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level VADER analyzer, built once; get_vader_scores reads this global.
sid = SentimentIntensityAnalyzer()

def get_data_from_files(path):
    """Read every regular file directly inside ``path`` and return the contents.

    Args:
        path: Directory to read from. A trailing separator is optional.

    Returns:
        A list of strings, one per file, in the order ``os.listdir`` reports
        the entries (arbitrary, not sorted).

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    results = []
    for name in os.listdir(path):
        # os.path.join works whether or not the caller passed a trailing slash.
        full_path = os.path.join(path, name)
        # Skip sub-directories (e.g. notebook checkpoint folders); opening
        # one as a file would raise and abort the whole load.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as handle:
            results.append(handle.read())
    return results

# Load every corpus as a (negative, positive) pair of raw-text lists.
# Directory names are relative to the current working directory.
neg_k, pos_k = [get_data_from_files(folder) for folder in ('AI_NEG/', 'AI_POS/')]
neg_a, pos_a = [get_data_from_files(folder) for folder in ('NEG/', 'POS/')]
neg_cornell, pos_cornell = [get_data_from_files(folder)
                            for folder in ('neg_cornell/', 'pos_cornell/')]
neg_joker, pos_joker = [get_data_from_files(folder) for folder in ('NEG_JK/', 'POS_JK/')]

TEXT BLOB¶

def get_pn(num):
    """Map a polarity score to a class label.

    Strictly negative scores give 'neg'; zero and positive scores give 'pos'.
    """
    if num < 0:
        return 'neg'
    return 'pos'

def get_sentiment(array, label):
    """Score each text with TextBlob and return one result row per text.

    Args:
        array: Iterable of text strings.
        label: Known class label, copied unchanged into every row.

    Returns:
        A list of dicts with the keys 'label', 'prediction', 'sentiment'
        (the TextBlob polarity), 'length' (character count of the text) and
        'excerpt' (the first 50 characters of the text).
    """
    rows = []
    for text in array:
        polarity = TextBlob(text).sentiment.polarity
        rows.append({
            'label': label,
            'prediction': get_pn(polarity),
            'sentiment': polarity,
            'length': len(text),
            'excerpt': text[:50],
        })
    return rows

CASE STUDY 1: Kendra's Data¶

# Score both lists with TextBlob and render each result list as a DataFrame.
display(pd.DataFrame(get_sentiment(neg_k, 'neg')))
display(pd.DataFrame(get_sentiment(pos_k, 'pos')))

CASE STUDY 2: Ami's Data¶

# Score both lists with TextBlob and render each result list as a DataFrame.
display(pd.DataFrame(get_sentiment(neg_a, 'neg')))
display(pd.DataFrame(get_sentiment(pos_a, 'pos')))

CASE STUDY 3: Cornell Data¶

import numpy as np

# TextBlob on the Cornell data: one frame per true class, each with an
# 'accurate' column that is 'yes' where the prediction equals the label.
df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of rows per class whose prediction matched the true label.
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 229
CORRECT PREDICT POS: 971

CASE STUDY 4: Joker Review Data¶

# TextBlob on the Joker review data: one frame per true class, each with an
# 'accurate' column that is 'yes' where the prediction equals the label.
df_n = pd.DataFrame(get_sentiment(neg_joker, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_sentiment(pos_joker, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of rows per class whose prediction matched the true label.
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 64
CORRECT PREDICT POS: 114

VADER¶

def get_pn(num):
    """Return 'neg' for scores below zero, otherwise 'pos' (zero counts as 'pos')."""
    is_negative = num < 0
    return 'neg' if is_negative else 'pos'

def get_vader_scores(array, label):
    """Score each text with the module-level VADER analyzer ``sid``.

    Args:
        array: Iterable of text strings.
        label: Known class label, copied unchanged into every row.

    Returns:
        A list of dicts with the keys 'label', 'prediction', 'compound'
        (the 'compound' entry of ``sid.polarity_scores``) and 'excerpt'
        (the first 50 characters of the text).
    """
    def _row(text):
        compound = sid.polarity_scores(text)['compound']
        return {
            'label': label,
            'prediction': get_pn(compound),
            'compound': compound,
            'excerpt': text[:50],
        }

    return [_row(text) for text in array]

# Bare expression: as the last line of a notebook cell it echoes the list's repr.
neg_k

["WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICIAL INTELLIGENCE TOOK OUR JOBS.",
 "How can we trust Artificial Intelligence to drive our cars when they can't even hack a captcha?!",
 'I hate artificial intelligence!',
 'My dog is terrified by artificial intelligence!',
 'Artificial intelligence is going to melt the brains of our children!']

CASE STUDY 1: Kendra's Data¶

# Score both lists with VADER and render each result list as a DataFrame.
display(pd.DataFrame(get_vader_scores(neg_k, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_k, 'pos')))

CASE STUDY 2: Ami's Data¶

# Score both lists with VADER and render each result list as a DataFrame.
display(pd.DataFrame(get_vader_scores(neg_a, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_a, 'pos')))

CASE STUDY 3: Cornell Data¶

# VADER on the Cornell data. This cell sits in the VADER section, so it must
# use get_vader_scores; it previously called the TextBlob scorer get_sentiment.
# NOTE: any counts recorded from the earlier version are TextBlob results and
# have to be regenerated by re-running this cell.
df_n = pd.DataFrame(get_vader_scores(neg_cornell, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_cornell, 'pos'))

# 'accurate' is 'yes' where the predicted class equals the true label.
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of rows per class whose prediction matched the true label.
print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())
print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())

CORRECT PREDICT NEG: 229
CORRECT PREDICT POS: 971

CASE STUDY 4: Joker Review Data¶

# VADER on the Joker review data. This cell sits in the VADER section, so it
# must use get_vader_scores; it previously called the TextBlob scorer
# get_sentiment.
# NOTE: any counts recorded from the earlier version are TextBlob results and
# have to be regenerated by re-running this cell.
df_n = pd.DataFrame(get_vader_scores(neg_joker, 'neg'))
df_p = pd.DataFrame(get_vader_scores(pos_joker, 'pos'))

# 'accurate' is 'yes' where the predicted class equals the true label.
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of rows per class whose prediction matched the true label.
print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())
print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())

CORRECT PREDICT NEG: 64
CORRECT PREDICT POS: 114

NLTK with NaiveBayes¶

from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def get_tokens(sentence):
    """Tokenize ``sentence`` and return its purely alphabetic tokens, lowercased.

    Tokens for which ``str.isalpha`` is false (numbers, punctuation, mixed
    tokens) are dropped.
    """
    kept = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        kept.append(token.lower())
    return kept

def get_nltk_train_test(array, label, num_train):
    """Tokenize and label every text, then split into train and test parts.

    Args:
        array: Sequence of text strings (must support ``len``).
        label: Class label attached to every document.
        num_train: Number of leading documents that go into the training part.

    Returns:
        A two-element list ``[train_docs, test_docs]``; each document is a
        ``(tokens, label)`` tuple.
    """
    labelled = [(get_tokens(text), label) for text in array]
    end = len(array)
    head = labelled[:num_train]
    tail = labelled[num_train:end]
    return [head, tail]


def get_nltk_NB(NEG_DATA, POS_DATA, num_train):
    """Train an NLTK Naive Bayes sentiment classifier and print its metrics.

    Args:
        NEG_DATA: Sequence of negative-class texts.
        POS_DATA: Sequence of positive-class texts.
        num_train: Number of leading texts taken from EACH class for
            training; the remaining texts of each class form the test set.

    Returns:
        None. Every entry of ``SentimentAnalyzer.evaluate`` on the test set
        is printed as ``name: value``, sorted by name.
    """
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)

    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos

    sentim_analyzer = SentimentAnalyzer()
    # The unigram vocabulary is collected from negation-marked training docs.
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    # The returned classifier is not needed here; evaluate() is called on the
    # analyzer itself.
    sentim_analyzer.train(NaiveBayesClassifier.train, training_set)

    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

CASE STUDY 1: Kendra's Data¶

# Train on the first 4 texts of each class; the remaining texts are the test set.
get_nltk_NB(neg_k, pos_k, 4)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 1.0
F-measure [neg]: 1.0
F-measure [pos]: 1.0
Precision [neg]: 1.0
Precision [pos]: 1.0
Recall [neg]: 1.0
Recall [pos]: 1.0

CASE STUDY 2: Ami's Data¶

# Train on the first 4 texts of each class; the remaining texts are the test set.
get_nltk_NB(neg_a, pos_a, 4)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.5
F-measure [neg]: 0.6666666666666666
F-measure [pos]: None
Precision [neg]: 0.5
Precision [pos]: None
Recall [neg]: 1.0
Recall [pos]: 0.0

CASE STUDY 3: Cornell Data¶

# Train on the first 800 texts of each class; the remaining texts are the test set.
get_nltk_NB(neg_cornell, pos_cornell, 800)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8125
F-measure [neg]: 0.8259860788863109
F-measure [pos]: 0.7967479674796748
Precision [neg]: 0.7705627705627706
Precision [pos]: 0.8698224852071006
Recall [neg]: 0.89
Recall [pos]: 0.735

# Joker review data: train on the first 86 texts of each class; the remaining
# texts are the test set.
get_nltk_NB(neg_joker, pos_joker, 86)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.581081081081081
F-measure [neg]: 0.6593406593406593
F-measure [pos]: 0.456140350877193
Precision [neg]: 0.5555555555555556
Precision [pos]: 0.65
Recall [neg]: 0.8108108108108109
Recall [pos]: 0.35135135135135137

	label	prediction	sentiment	length	excerpt
0	neg	pos	0.152083	1734	Missed Opportunity\nI had been very excited t...
1	neg	neg	-0.001852	328	5/5 for Phoenix's acting..\nI don't think the...
2	neg	pos	0.200000	145	Everyone praised an overrated movie.\nOverrat...
3	neg	neg	-0.038095	350	What idiotic FIlm\nI can say that Phoenix is ...
4	neg	pos	0.126398	711	Terrible\nThe only thing good about this movi...
...	...	...	...	...	...
118	neg	neg	-0.290909	432	Boring and disappointing 😣\nGreat job acting ...
119	neg	pos	0.164710	853	A masterclass in acting nothing more\nI don't...
120	neg	pos	0.126667	242	Not equal to the sum of its parts.\nDespite a...
121	neg	neg	-0.187500	128	Not real Joker\nThis movie is poorly done as ...
122	neg	neg	-0.057436	1212	HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...

	label	prediction	sentiment	length	excerpt
0	pos	pos	0.107162	5554	funny like a clown\nGreetings again from the ...
1	pos	pos	0.014881	473	Only certain people can relate\nThis is a mov...
2	pos	pos	0.008294	2509	"That's Life."\nIn an era of cinema so satura...
3	pos	pos	0.036939	4022	Best DC movie since The Dark Knight Rises\nDC...
4	pos	neg	-0.017162	1430	unbelievable, unrelatable, a bit boring to be...
...	...	...	...	...	...
118	pos	pos	0.065000	353	Nerve-wracking, but in very uncomfortable way...
119	pos	pos	0.035557	3501	Solid film but there are glaring problems\nOk...
120	pos	pos	0.250203	510	Joker > Endgame\nNeed I say more? Everything ...
121	pos	pos	0.003030	424	Absolutely not a 10\nStrong fanboy and hype r...
122	pos	pos	0.117628	363	Overhyped, but it's alright\nIt's a good film...

	label	prediction	sentiment	length	excerpt
0	neg	neg	-0.054577	3554	that's exactly how long the movie felt to me ....
1	neg	pos	0.025467	2929	" quest for camelot " is warner bros . ' firs...
2	neg	pos	0.003334	3365	so ask yourself what " 8mm " ( " eight millime...
3	neg	pos	0.022925	4418	synopsis : a mentally unstable man undergoing ...
4	neg	pos	0.043234	3911	capsule : in 2176 on the planet mars police ta...

	label	prediction	sentiment	length	excerpt
0	pos	pos	0.023663	4227	films adapted from comic books have had plenty...
1	pos	pos	0.131092	2421	you've got mail works alot better than it dese...
2	pos	pos	0.110626	6092	" jaws " is a rare film that grabs your atten...
3	pos	pos	0.103847	4096	every now and then a movie comes along from a ...
4	pos	neg	-0.070151	3898	moviemaking is a lot like being the general ma...

	label	prediction	sentiment	length	excerpt	accurate
0	neg	pos	0.026240	5953	bad . bad . \nbad . \nthat one word seems to p...	no
1	neg	pos	0.076040	3396	isn't it the ultimate sign of a movie's cinema...	no
2	neg	neg	-0.128733	2762	" gordy " is not a movie , it is a 90-minute-...	yes
3	neg	neg	-0.000485	3840	disconnect the phone line . \ndon't accept the...	yes
4	neg	pos	0.122770	2270	when robert forster found himself famous again...	no
...	...	...	...	...	...	...
995	neg	pos	0.145489	1945	synopsis : when a meteorite crashlands in the ...	no
996	neg	pos	0.102723	3116	it's now the anniversary of the slayings of ju...	no
997	neg	pos	0.042473	1755	coinciding with the emerging popularity of mov...	no
998	neg	neg	-0.048656	2826	and now the high-flying hong kong style of fil...	yes
999	neg	neg	-0.090655	4165	battlefield long , boring and just plain stupi...	yes

	label	prediction	sentiment	length	excerpt	accurate
0	pos	pos	0.221173	4662	assume nothing . \nthe phrase is perhaps one o...	yes
1	pos	pos	0.089736	3839	plot : derek zoolander is a male model . \nhe ...	yes
2	pos	pos	0.206743	9380	i actually am a fan of the original 1961 or so...	yes
3	pos	pos	0.141905	2407	a movie that's been as highly built up as the ...	yes
4	pos	pos	0.176332	1840	" good will hunting " is two movies in one : ...	yes
...	...	...	...	...	...	...
995	pos	pos	0.072815	2658	one of the funniest carry on movies and the th...	yes
996	pos	pos	0.102879	4196	i remember making a pact , right after `patch ...	yes
997	pos	pos	0.195097	2094	barely scrapping by playing at a nyc piano bar...	yes
998	pos	pos	0.117530	4575	if the current trends of hollywood filmmaking ...	yes
999	pos	neg	-0.013569	3870	capsule : the director of cure brings a weird ...	no

	label	prediction	compound	excerpt
0	neg	pos	0.5255	WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1	neg	pos	0.7712	How can we trust Artificial Intelligence to dr...
2	neg	neg	-0.2244	I hate artificial intelligence!
3	neg	neg	-0.2942	My dog is terrified by artificial intelligence!
4	neg	pos	0.5255	Artificial intelligence is going to melt the b...

	label	prediction	compound	excerpt
0	pos	pos	0.6705	My dog is excited by the advancements in artif...
1	pos	pos	0.8271	I'm excited for my child to grow up and have t...
2	pos	pos	0.8221	I love artificial intelligence!
3	pos	pos	0.8213	Order my groceries, pay my taxes, take my kids...
4	pos	pos	0.8402	I'm grateful every day that my child will like...

	label	prediction	compound	excerpt
0	neg	pos	0.7836	that's exactly how long the movie felt to me ....
1	neg	neg	-0.8481	" quest for camelot " is warner bros . ' firs...
2	neg	neg	-0.9753	so ask yourself what " 8mm " ( " eight millime...
3	neg	pos	0.6824	synopsis : a mentally unstable man undergoing ...
4	neg	neg	-0.9879	capsule : in 2176 on the planet mars police ta...

	label	prediction	compound	excerpt
0	pos	neg	-0.5887	films adapted from comic books have had plenty...
1	pos	pos	0.9964	you've got mail works alot better than it dese...
2	pos	pos	0.9868	" jaws " is a rare film that grabs your atten...
3	pos	pos	0.8825	every now and then a movie comes along from a ...
4	pos	neg	-0.3525	moviemaking is a lot like being the general ma...