Sentiment Analysis: TextBlob + VADER + NLTK Naive Bayes¶

Based on an online tutorial (the link was lost in export) | 10-6-19

from textblob import TextBlob
from IPython.display import display, HTML
import os
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Module-level VADER analyzer, created once and reused by every
# sid.polarity_scores(...) call further down.
sid = SentimentIntensityAnalyzer()

def get_data_from_files(path):
    """Read every regular file directly inside ``path``.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per file (the file's full contents), in the
        order ``os.listdir`` reports the entries. That order is not
        guaranteed to be sorted.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    results = []
    for name in os.listdir(path):
        # os.path.join works whether or not ``path`` ends with a separator;
        # plain string concatenation only worked with a trailing slash.
        full_path = os.path.join(path, name)
        # Skip sub-directories (e.g. Jupyter's .ipynb_checkpoints), which
        # open() cannot read.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as f:
            results.append(f.read())
    return results

# Load each corpus as a (negative, positive) pair of document lists. Folder
# names are relative to the current working directory.
neg_k, pos_k = get_data_from_files('AI_NEG/'), get_data_from_files('AI_POS/')
neg_a, pos_a = get_data_from_files('NEG/'), get_data_from_files('POS/')
neg_cornell, pos_cornell = (get_data_from_files('neg_cornell/'),
                            get_data_from_files('pos_cornell/'))
neg_dirty, pos_dirty = (get_data_from_files('NEG_dirty/'),
                        get_data_from_files('POS_dirty/'))
neg_joker, pos_joker = (get_data_from_files('NEG_JK/'),
                        get_data_from_files('POS_JK/'))

TEXT BLOB¶

def get_pn(num):
    """Turn a polarity score into a class label.

    Strictly negative scores map to 'neg'; zero and positive scores map to
    'pos'.
    """
    if num < 0:
        return 'neg'
    return 'pos'

def get_sentiment(array, label):
    """Score each text with TextBlob and return one record per text.

    Args:
        array: Iterable of raw text strings.
        label: Known class of every text; stored unchanged in each record.

    Returns:
        A list of dicts with the keys 'label', 'prediction' (from get_pn),
        'sentiment' (TextBlob polarity), 'length' (character count) and
        'excerpt' (the first 50 characters of the text).
    """
    records = []
    for text in array:
        polarity = TextBlob(text).sentiment.polarity
        records.append({
            'label': label,
            'prediction': get_pn(polarity),
            'sentiment': polarity,
            'length': len(text),
            'excerpt': text[:50],
        })
    return records

CASE STUDY 1: Kendra's Data¶

# Show TextBlob's per-document scores for neg_k, then for pos_k.
display(pd.DataFrame(get_sentiment(neg_k, 'neg')))
display(pd.DataFrame(get_sentiment(pos_k, 'pos')))

CASE STUDY 2: Ami's Data¶

# Show TextBlob's per-document scores for neg_a, then for pos_a.
display(pd.DataFrame(get_sentiment(neg_a, 'neg')))
display(pd.DataFrame(get_sentiment(pos_a, 'pos')))

CASE STUDY 3: Cornell Data¶

import numpy as np

# Score both Cornell classes with TextBlob and flag the rows whose prediction
# matches the known label.
df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 229
CORRECT PREDICT POS: 971

CASE STUDY 4: Dirty Data¶

# Score both classes of the "dirty" corpus with TextBlob and flag the rows
# whose prediction matches the known label.
df_n = pd.DataFrame(get_sentiment(neg_dirty, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_sentiment(pos_dirty, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 227
CORRECT PREDICT POS: 972

CASE STUDY 5: Joker Review Data¶

# Score both classes of the Joker reviews with TextBlob and flag the rows
# whose prediction matches the known label.
df_n = pd.DataFrame(get_sentiment(neg_joker, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_sentiment(pos_joker, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 64
CORRECT PREDICT POS: 114

VADER¶

def get_pn(num):
    """Label a score as 'neg' when it is below zero, otherwise as 'pos'."""
    label = 'pos'
    if num < 0:
        label = 'neg'
    return label

def get_vader_scores(array, label):
    """Score each text with the module-level VADER analyzer ``sid``.

    Args:
        array: Iterable of raw text strings.
        label: Known class of every text; stored unchanged in each record.

    Returns:
        A list of dicts with the keys 'label', 'prediction' (from get_pn),
        'compound' (the 'compound' entry of sid.polarity_scores) and
        'excerpt' (the first 50 characters of the text).
    """
    def to_record(sentence):
        compound = sid.polarity_scores(sentence)['compound']
        return {
            'label': label,
            'prediction': get_pn(compound),
            'compound': compound,
            'excerpt': sentence[:50],
        }

    return [to_record(sentence) for sentence in array]

# Echo the raw neg_k documents so they can be read next to the scores.
neg_k

["WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICIAL INTELLIGENCE TOOK OUR JOBS.",
 "How can we trust Artificial Intelligence to drive our cars when they can't even hack a captcha?!",
 'I hate artificial intelligence!',
 'My dog is terrified by artificial intelligence!',
 'Artificial intelligence is going to melt the brains of our children!']

CASE STUDY 1: Kendra's Data¶

# Show VADER's per-document compound scores for neg_k, then for pos_k.
display(pd.DataFrame(get_vader_scores(neg_k, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_k, 'pos')))

CASE STUDY 2: Ami's Data¶

# Show VADER's per-document compound scores for neg_a, then for pos_a.
display(pd.DataFrame(get_vader_scores(neg_a, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_a, 'pos')))

CASE STUDY 3: Cornell Data¶

# Score both Cornell classes with VADER and flag the rows whose prediction
# matches the known label.
df_n = pd.DataFrame(get_vader_scores(neg_cornell, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_vader_scores(pos_cornell, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 445
CORRECT PREDICT POS: 828

CASE STUDY 4: Dirty Data¶

# Score both classes of the "dirty" corpus with VADER and flag the rows whose
# prediction matches the known label.
df_n = pd.DataFrame(get_vader_scores(neg_dirty, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_vader_scores(pos_dirty, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 454
CORRECT PREDICT POS: 824

CASE STUDY 5: Joker Review Data¶

# Score both classes of the Joker reviews with VADER and flag the rows whose
# prediction matches the known label.
df_n = pd.DataFrame(get_vader_scores(neg_joker, 'neg'))
df_n['accurate'] = np.where(df_n['prediction'] == df_n['label'], 'yes', 'no')
df_p = pd.DataFrame(get_vader_scores(pos_joker, 'pos'))
df_p['accurate'] = np.where(df_p['prediction'] == df_p['label'], 'yes', 'no')

display(df_n)
display(df_p)

# Number of correctly classified documents per class (a count, not a rate).
print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())
print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())

CORRECT PREDICT NEG: 64
CORRECT PREDICT POS: 114

NLTK with NaiveBayes¶

from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def get_tokens(sentence):
    """Tokenize ``sentence`` and return its alphabetic tokens, lower-cased.

    Tokens containing any non-alphabetic character (digits, punctuation,
    contractions such as "n't") are dropped.
    """
    alphabetic = filter(str.isalpha, word_tokenize(sentence))
    return list(map(str.lower, alphabetic))

def get_nltk_train_test(array, label, num_train):
    """Tokenize, label and split documents into train and test portions.

    Args:
        array: Iterable of raw text strings.
        label: Class label attached to every document.
        num_train: Number of leading documents that go to the training set.

    Returns:
        ``[train_docs, test_docs]``. Each document is a ``(tokens, label)``
        tuple with ``tokens`` produced by ``get_tokens``. The first
        ``num_train`` documents are the training portion and all remaining
        documents are the test portion. The input order is kept; nothing is
        shuffled.
    """
    docs = [(get_tokens(sentence), label) for sentence in array]
    # An open-ended slice already stops at the end of ``docs``; the former
    # ``len(array)`` bound was redundant and required ``array`` to be sized.
    return [docs[:num_train], docs[num_train:]]


def get_nltk_NB(NEG_DATA, POS_DATA, num_train):
    """Train and evaluate an NLTK Naive Bayes sentiment classifier.

    The first ``num_train`` documents of each class are used for training and
    the remaining documents for testing (see ``get_nltk_train_test``). The
    evaluation metrics are printed one per line, sorted by metric name.
    Nothing is returned.

    Args:
        NEG_DATA: Negative-class texts.
        POS_DATA: Positive-class texts.
        num_train: Number of documents per class used for training.
    """
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)

    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos

    sentim_analyzer = SentimentAnalyzer()
    # The unigram vocabulary is built from the training documents only (after
    # negation marking), so no test-set words leak into the feature set.
    all_words_neg = sentim_analyzer.all_words(
        [mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats,
                                       unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    # The analyzer keeps the fitted classifier itself and evaluate() uses it,
    # so the value returned by train() does not need to be bound here.
    sentim_analyzer.train(NaiveBayesClassifier.train, training_set)

    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))

CASE STUDY 1: Kendra's Data¶

# Train on the first 4 documents of each class; the rest form the test set.
get_nltk_NB(neg_k, pos_k, 4)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 1.0
F-measure [neg]: 1.0
F-measure [pos]: 1.0
Precision [neg]: 1.0
Precision [pos]: 1.0
Recall [neg]: 1.0
Recall [pos]: 1.0

CASE STUDY 2: Ami's Data¶

# Train on the first 4 documents of each class; the rest form the test set.
get_nltk_NB(neg_a, pos_a, 4)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.5
F-measure [neg]: 0.6666666666666666
F-measure [pos]: None
Precision [neg]: 0.5
Precision [pos]: None
Recall [neg]: 1.0
Recall [pos]: 0.0

CASE STUDY 3: Cornell Data¶

# Train on the first 800 documents of each class; the rest form the test set.
get_nltk_NB(neg_cornell, pos_cornell, 800)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8125
F-measure [neg]: 0.8259860788863109
F-measure [pos]: 0.7967479674796748
Precision [neg]: 0.7705627705627706
Precision [pos]: 0.8698224852071006
Recall [neg]: 0.89
Recall [pos]: 0.735

CASE STUDY 4: Dirty Data¶

# Train on the first 800 documents of each class; the rest form the test set.
get_nltk_NB(neg_dirty, pos_dirty, 800)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.7775
F-measure [neg]: 0.7944572748267898
F-measure [pos]: 0.757493188010899
Precision [neg]: 0.7381974248927039
Precision [pos]: 0.8323353293413174
Recall [neg]: 0.86
Recall [pos]: 0.695

CASE STUDY 5: Joker Review Data¶

# Train on the first 86 documents of each class; the rest form the test set.
get_nltk_NB(neg_joker, pos_joker, 86)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.581081081081081
F-measure [neg]: 0.6593406593406593
F-measure [pos]: 0.456140350877193
Precision [neg]: 0.5555555555555556
Precision [pos]: 0.65
Recall [neg]: 0.8108108108108109
Recall [pos]: 0.35135135135135137

	label	prediction	sentiment	length	excerpt
0	neg	neg	-0.157143	76	WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1	neg	neg	-0.750000	96	How can we trust Artificial Intelligence to dr...
2	neg	neg	-0.775000	31	I hate artificial intelligence!
3	neg	neg	-0.750000	47	My dog is terrified by artificial intelligence!
4	neg	neg	-0.750000	68	Artificial intelligence is going to melt the b...

	label	prediction	sentiment	length	excerpt
0	pos	neg	-0.112500	65	My dog is excited by the advancements in artif...
1	pos	neg	-0.075000	133	I'm excited for my child to grow up and have t...
2	pos	neg	-0.125000	31	I love artificial intelligence!
3	pos	neg	-0.300000	121	Order my groceries, pay my taxes, take my kids...
4	pos	neg	-0.133333	116	I'm grateful every day that my child will like...

	label	prediction	sentiment	length	excerpt
0	neg	neg	-0.054577	3554	that's exactly how long the movie felt to me ....
1	neg	pos	0.025467	2929	" quest for camelot " is warner bros . ' firs...
2	neg	pos	0.003334	3365	so ask yourself what " 8mm " ( " eight millime...
3	neg	pos	0.022925	4418	synopsis : a mentally unstable man undergoing ...
4	neg	pos	0.043234	3911	capsule : in 2176 on the planet mars police ta...

	label	prediction	sentiment	length	excerpt
0	pos	pos	0.023663	4227	films adapted from comic books have had plenty...
1	pos	pos	0.131092	2421	you've got mail works alot better than it dese...
2	pos	pos	0.110626	6092	" jaws " is a rare film that grabs your atten...
3	pos	pos	0.103847	4096	every now and then a movie comes along from a ...
4	pos	neg	-0.070151	3898	moviemaking is a lot like being the general ma...

	label	prediction	sentiment	length	excerpt	accurate
0	neg	pos	0.026240	5953	bad . bad . \nbad . \nthat one word seems to p...	no
1	neg	pos	0.076040	3396	isn't it the ultimate sign of a movie's cinema...	no
2	neg	neg	-0.128733	2762	" gordy " is not a movie , it is a 90-minute-...	yes
3	neg	neg	-0.000485	3840	disconnect the phone line . \ndon't accept the...	yes
4	neg	pos	0.122770	2270	when robert forster found himself famous again...	no
...	...	...	...	...	...	...
995	neg	pos	0.145489	1945	synopsis : when a meteorite crashlands in the ...	no
996	neg	pos	0.102723	3116	it's now the anniversary of the slayings of ju...	no
997	neg	pos	0.042473	1755	coinciding with the emerging popularity of mov...	no
998	neg	neg	-0.048656	2826	and now the high-flying hong kong style of fil...	yes
999	neg	neg	-0.090655	4165	battlefield long , boring and just plain stupi...	yes

	label	prediction	sentiment	length	excerpt	accurate
0	pos	pos	0.221173	4662	assume nothing . \nthe phrase is perhaps one o...	yes
1	pos	pos	0.089736	3839	plot : derek zoolander is a male model . \nhe ...	yes
2	pos	pos	0.206743	9380	i actually am a fan of the original 1961 or so...	yes
3	pos	pos	0.141905	2407	a movie that's been as highly built up as the ...	yes
4	pos	pos	0.176332	1840	" good will hunting " is two movies in one : ...	yes
...	...	...	...	...	...	...
995	pos	pos	0.072815	2658	one of the funniest carry on movies and the th...	yes
996	pos	pos	0.102879	4196	i remember making a pact , right after `patch ...	yes
997	pos	pos	0.195097	2094	barely scrapping by playing at a nyc piano bar...	yes
998	pos	pos	0.117530	4575	if the current trends of hollywood filmmaking ...	yes
999	pos	neg	-0.013569	3870	capsule : the director of cure brings a weird ...	no

	label	prediction	sentiment	length	excerpt	accurate
0	neg	neg	-0.004665	3777	by starring in amy heckerlings clueless two ...	yes
1	neg	pos	0.119184	3639	i have little against remakes and updates of o...	no
2	neg	pos	0.100886	4247	i cant recall a previous film experience where...	no
3	neg	pos	0.097526	4308	the tagline for this film is : some houses ar...	no
4	neg	pos	0.048745	5175	warner brothers ; rated pg-13 ( mild violence ...	no
...	...	...	...	...	...	...
995	neg	pos	0.014624	4086	`the bachelor is one of the best terrible movi...	no
996	neg	pos	0.035911	3741	as a hot-shot defense attorney , kevin lomax (...	no
997	neg	pos	0.101395	2890	violence is bad . violence is ugly . violence ...	no
998	neg	pos	0.088523	4089	even though i have the utmost respect for rich...	no
999	neg	pos	0.074695	2433	an attempt at florida film noir , palmetto fai...	no

	label	prediction	sentiment	length	excerpt	accurate
0	pos	pos	0.134641	4584	for the first reel of girls town , you just ca...	yes
1	pos	pos	0.137134	3102	field of dreams almost defies description . al...	yes
2	pos	pos	0.181355	3521	meet joe black is your classic boy-meets-girl ...	yes
3	pos	pos	0.104101	2192	an indian runner was more than a courier . he ...	yes
4	pos	pos	0.204967	4955	every once in a while , when an exceptional fa...	yes
...	...	...	...	...	...	...
995	pos	pos	0.118713	4929	the laserman : somehow the title of writer-dir...	yes
996	pos	pos	0.150425	4264	i know what you did last summer , the first...	yes
997	pos	pos	0.121243	2374	buffalo ? 66 is a very rarely known movie that...	yes
998	pos	pos	0.130603	2508	time bandits , from director terry gilliam , i...	yes
999	pos	pos	0.011179	5355	warren beattys bulworth is a caustic politic...	yes

	label	prediction	sentiment	length	excerpt	accurate
0	neg	pos	0.152083	1734	Missed Opportunity\nI had been very excited t...	no
1	neg	neg	-0.001852	328	5/5 for Phoenix's acting..\nI don't think the...	yes
2	neg	pos	0.200000	145	Everyone praised an overrated movie.\nOverrat...	no
3	neg	neg	-0.038095	350	What idiotic FIlm\nI can say that Phoenix is ...	yes
4	neg	pos	0.126398	711	Terrible\nThe only thing good about this movi...	no
...	...	...	...	...	...	...
118	neg	neg	-0.290909	432	Boring and disappointing 😣\nGreat job acting ...	yes
119	neg	pos	0.164710	853	A masterclass in acting nothing more\nI don't...	no
120	neg	pos	0.126667	242	Not equal to the sum of its parts.\nDespite a...	no
121	neg	neg	-0.187500	128	Not real Joker\nThis movie is poorly done as ...	yes
122	neg	neg	-0.057436	1212	HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...	yes

	label	prediction	sentiment	length	excerpt	accurate
0	pos	pos	0.107162	5554	funny like a clown\nGreetings again from the ...	yes
1	pos	pos	0.014881	473	Only certain people can relate\nThis is a mov...	yes
2	pos	pos	0.008294	2509	"That's Life."\nIn an era of cinema so satura...	yes
3	pos	pos	0.036939	4022	Best DC movie since The Dark Knight Rises\nDC...	yes
4	pos	neg	-0.017162	1430	unbelievable, unrelatable, a bit boring to be...	no
...	...	...	...	...	...	...
118	pos	pos	0.065000	353	Nerve-wracking, but in very uncomfortable way...	yes
119	pos	pos	0.035557	3501	Solid film but there are glaring problems\nOk...	yes
120	pos	pos	0.250203	510	Joker > Endgame\nNeed I say more? Everything ...	yes
121	pos	pos	0.003030	424	Absolutely not a 10\nStrong fanboy and hype r...	yes
122	pos	pos	0.117628	363	Overhyped, but it's alright\nIt's a good film...	yes

	label	prediction	compound	excerpt
0	neg	pos	0.5255	WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1	neg	pos	0.7712	How can we trust Artificial Intelligence to dr...
2	neg	neg	-0.2244	I hate artificial intelligence!
3	neg	neg	-0.2942	My dog is terrified by artificial intelligence!
4	neg	pos	0.5255	Artificial intelligence is going to melt the b...

	label	prediction	compound	excerpt
0	pos	pos	0.6705	My dog is excited by the advancements in artif...
1	pos	pos	0.8271	I'm excited for my child to grow up and have t...
2	pos	pos	0.8221	I love artificial intelligence!
3	pos	pos	0.8213	Order my groceries, pay my taxes, take my kids...
4	pos	pos	0.8402	I'm grateful every day that my child will like...

	label	prediction	compound	excerpt
0	neg	pos	0.7836	that's exactly how long the movie felt to me ....
1	neg	neg	-0.8481	" quest for camelot " is warner bros . ' firs...
2	neg	neg	-0.9753	so ask yourself what " 8mm " ( " eight millime...
3	neg	pos	0.6824	synopsis : a mentally unstable man undergoing ...
4	neg	neg	-0.9879	capsule : in 2176 on the planet mars police ta...

	label	prediction	compound	excerpt
0	pos	neg	-0.5887	films adapted from comic books have had plenty...
1	pos	pos	0.9964	you've got mail works alot better than it dese...
2	pos	pos	0.9868	" jaws " is a rare film that grabs your atten...
3	pos	pos	0.8825	every now and then a movie comes along from a ...
4	pos	neg	-0.3525	moviemaking is a lot like being the general ma...

	label	prediction	compound	excerpt	accurate
0	neg	pos	0.9695	bad . bad . \nbad . \nthat one word seems to p...	no
1	neg	pos	0.1722	isn't it the ultimate sign of a movie's cinema...	no
2	neg	neg	-0.9970	" gordy " is not a movie , it is a 90-minute-...	yes
3	neg	pos	0.9861	disconnect the phone line . \ndon't accept the...	no
4	neg	pos	0.7445	when robert forster found himself famous again...	no
...	...	...	...	...	...
995	neg	pos	0.9828	synopsis : when a meteorite crashlands in the ...	no
996	neg	pos	0.8979	it's now the anniversary of the slayings of ju...	no
997	neg	neg	-0.9371	coinciding with the emerging popularity of mov...	yes
998	neg	neg	-0.9923	and now the high-flying hong kong style of fil...	yes
999	neg	neg	-0.9837	battlefield long , boring and just plain stupi...	yes

	label	prediction	compound	excerpt	accurate
0	pos	pos	0.9985	assume nothing . \nthe phrase is perhaps one o...	yes
1	pos	pos	0.9853	plot : derek zoolander is a male model . \nhe ...	yes
2	pos	pos	0.9998	i actually am a fan of the original 1961 or so...	yes
3	pos	pos	0.9671	a movie that's been as highly built up as the ...	yes
4	pos	pos	0.9300	" good will hunting " is two movies in one : ...	yes
...	...	...	...	...	...
995	pos	pos	0.9913	one of the funniest carry on movies and the th...	yes
996	pos	pos	0.9985	i remember making a pact , right after `patch ...	yes
997	pos	pos	0.9964	barely scrapping by playing at a nyc piano bar...	yes
998	pos	pos	0.9975	if the current trends of hollywood filmmaking ...	yes
999	pos	neg	-0.9914	capsule : the director of cure brings a weird ...	no

	label	prediction	compound	excerpt	accurate
0	neg	neg	-0.9326	by starring in amy heckerlings clueless two ...	yes
1	neg	pos	0.8326	i have little against remakes and updates of o...	no
2	neg	pos	0.9491	i cant recall a previous film experience where...	no
3	neg	pos	0.9854	the tagline for this film is : some houses ar...	no
4	neg	neg	-0.8077	warner brothers ; rated pg-13 ( mild violence ...	yes
...	...	...	...	...	...
995	neg	pos	0.9927	`the bachelor is one of the best terrible movi...	no
996	neg	neg	-0.9803	as a hot-shot defense attorney , kevin lomax (...	yes
997	neg	neg	-0.3950	violence is bad . violence is ugly . violence ...	yes
998	neg	pos	0.9827	even though i have the utmost respect for rich...	no
999	neg	neg	-0.5308	an attempt at florida film noir , palmetto fai...	yes

	label	prediction	compound	excerpt	accurate
0	pos	neg	-0.9888	for the first reel of girls town , you just ca...	no
1	pos	pos	0.9885	field of dreams almost defies description . al...	yes
2	pos	pos	0.9806	meet joe black is your classic boy-meets-girl ...	yes
3	pos	neg	-0.9614	an indian runner was more than a courier . he ...	no
4	pos	pos	0.9992	every once in a while , when an exceptional fa...	yes
...	...	...	...	...	...
995	pos	pos	0.9920	the laserman : somehow the title of writer-dir...	yes
996	pos	neg	-0.9924	i know what you did last summer , the first...	no
997	pos	pos	0.9921	buffalo ? 66 is a very rarely known movie that...	yes
998	pos	pos	0.9574	time bandits , from director terry gilliam , i...	yes
999	pos	neg	-0.9947	warren beattys bulworth is a caustic politic...	no