Sentiment Analysis: TextBlob + VADER

via this tutorial | 10-6-19

In [1]:
from textblob import TextBlob
from IPython.display import display, HTML
import os
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
In [76]:
def get_data_from_files(path):
    """Read every file in `path` and return the contents as a list of strings."""
    results = []
    for file_name in os.listdir(path):
        with open(os.path.join(path, file_name)) as f:
            results.append(f.read())
    return results

neg_k = get_data_from_files('AI_NEG/')
pos_k = get_data_from_files('AI_POS/')
neg_a = get_data_from_files('NEG/')
pos_a = get_data_from_files('POS/')
neg_cornell = get_data_from_files('neg_cornell/')
pos_cornell = get_data_from_files('pos_cornell/')
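
Before scoring anything, it can help to confirm what was actually loaded. A minimal sanity check (assuming the cell above has run) that prints the document count for each folder:

In [ ]:
# Sanity check: how many documents came out of each folder.
for name, docs in [('neg_k', neg_k), ('pos_k', pos_k),
                   ('neg_a', neg_a), ('pos_a', pos_a),
                   ('neg_cornell', neg_cornell), ('pos_cornell', pos_cornell)]:
    print(name, len(docs))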

TextBlob

In [3]:
def get_pn(num):
    # Polarity below 0 counts as negative; 0 or above counts as positive.
    return 'neg' if num < 0 else 'pos'

def get_sentiment(array, label):
    # Score each document with TextBlob and keep its polarity plus a short excerpt.
    blobs = [[TextBlob(text), text] for text in array]
    return [{'label': label,
             'prediction': get_pn(obj.sentiment.polarity),
             'sentiment': obj.sentiment.polarity,
             'length': len(text),
             'excerpt': text[:50]} for obj, text in blobs]
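
As defined above, `get_pn` calls anything with polarity below 0 negative and everything else (including exactly 0) positive. A minimal sketch, assuming the data cells above have run, of what TextBlob returns for a single document (polarity is a float in [-1.0, 1.0], subjectivity in [0.0, 1.0]):

In [ ]:
# Inspect TextBlob's raw scores for one document before batching.
blob = TextBlob(neg_k[0])
print(blob.sentiment.polarity, blob.sentiment.subjectivity, get_pn(blob.sentiment.polarity))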

CASE STUDY 1: Kendra's Data

In [4]:
display(pd.DataFrame(get_sentiment(neg_k, 'neg')))
display(pd.DataFrame(get_sentiment(pos_k, 'pos')))
label prediction sentiment length excerpt
0 neg neg -0.157143 76 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1 neg neg -0.750000 96 How can we trust Artificial Intelligence to dr...
2 neg neg -0.775000 31 I hate artificial intelligence!
3 neg neg -0.750000 47 My dog is terrified by artificial intelligence!
4 neg neg -0.750000 68 Artificial intelligence is going to melt the b...
label prediction sentiment length excerpt
0 pos neg -0.112500 65 My dog is excited by the advancements in artif...
1 pos neg -0.075000 133 I'm excited for my child to grow up and have t...
2 pos neg -0.125000 31 I love artificial intelligence!
3 pos neg -0.300000 121 Order my groceries, pay my taxes, take my kids...
4 pos neg -0.133333 116 I'm grateful every day that my child will like...

CASE STUDY 2: Ami's Data

In [77]:
display(pd.DataFrame(get_sentiment(neg_a, 'neg')))
display(pd.DataFrame(get_sentiment(pos_a, 'pos')))
label prediction sentiment length excerpt
0 neg neg -0.054577 3554 that's exactly how long the movie felt to me ....
1 neg pos 0.025467 2929 " quest for camelot " is warner bros . ' firs...
2 neg pos 0.003334 3365 so ask yourself what " 8mm " ( " eight millime...
3 neg pos 0.022925 4418 synopsis : a mentally unstable man undergoing ...
4 neg pos 0.043234 3911 capsule : in 2176 on the planet mars police ta...
label prediction sentiment length excerpt
0 pos pos 0.023663 4227 films adapted from comic books have had plenty...
1 pos pos 0.131092 2421 you've got mail works alot better than it dese...
2 pos pos 0.110626 6092 " jaws " is a rare film that grabs your atten...
3 pos pos 0.103847 4096 every now and then a movie comes along from a ...
4 pos neg -0.070151 3898 moviemaking is a lot like being the general ma...

CASE STUDY 3: Cornell Data

In [87]:
df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))
df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))

import numpy as np
# Flag rows where the TextBlob prediction matches the true label.
df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')
df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')

# Count correct predictions in each set, then show the full frames.
print((df_n['accurate']=='yes').sum())
display(df_n)

print((df_p['accurate']=='yes').sum())
display(df_p)
229
label prediction sentiment length excerpt accurate
0 neg pos 0.026240 5953 bad . bad . \nbad . \nthat one word seems to p... no
1 neg pos 0.076040 3396 isn't it the ultimate sign of a movie's cinema... no
2 neg neg -0.128733 2762 " gordy " is not a movie , it is a 90-minute-... yes
3 neg neg -0.000485 3840 disconnect the phone line . \ndon't accept the... yes
4 neg pos 0.122770 2270 when robert forster found himself famous again... no
... ... ... ... ... ... ...
995 neg pos 0.145489 1945 synopsis : when a meteorite crashlands in the ... no
996 neg pos 0.102723 3116 it's now the anniversary of the slayings of ju... no
997 neg pos 0.042473 1755 coinciding with the emerging popularity of mov... no
998 neg neg -0.048656 2826 and now the high-flying hong kong style of fil... yes
999 neg neg -0.090655 4165 battlefield long , boring and just plain stupi... yes

1000 rows × 6 columns

971
label prediction sentiment length excerpt accurate
0 pos pos 0.221173 4662 assume nothing . \nthe phrase is perhaps one o... yes
1 pos pos 0.089736 3839 plot : derek zoolander is a male model . \nhe ... yes
2 pos pos 0.206743 9380 i actually am a fan of the original 1961 or so... yes
3 pos pos 0.141905 2407 a movie that's been as highly built up as the ... yes
4 pos pos 0.176332 1840 " good will hunting " is two movies in one : ... yes
... ... ... ... ... ... ...
995 pos pos 0.072815 2658 one of the funniest carry on movies and the th... yes
996 pos pos 0.102879 4196 i remember making a pact , right after `patch ... yes
997 pos pos 0.195097 2094 barely scrapping by playing at a nyc piano bar... yes
998 pos pos 0.117530 4575 if the current trends of hollywood filmmaking ... yes
999 pos neg -0.013569 3870 capsule : the director of cure brings a weird ... no

1000 rows × 6 columns
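
Combining the two counts above, TextBlob gets roughly (229 + 971) / 2000 = 60% of the Cornell reviews right, with most of its errors on the negative side. A small sketch to compute that overall figure directly from the two frames:

In [ ]:
# Overall TextBlob accuracy across both Cornell frames.
df_all = pd.concat([df_n, df_p], ignore_index=True)
print('overall accuracy:', (df_all['label'] == df_all['prediction']).mean())   # (229 + 971) / 2000 = 0.60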

VADER

In [7]:
def get_vader_scores(array, label):
    # Score each document with VADER and keep the compound score plus a short excerpt.
    vader_array = []
    for sentence in array:
        ss = sid.polarity_scores(sentence)
        vader_array.append({'label': label,
                            'compound': ss['compound'],
                            'excerpt': sentence[:50]})
    return vader_array
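
VADER's `polarity_scores` actually returns four values per document: the `neg`, `neu`, and `pos` proportions plus a normalized `compound` score in [-1, 1]; only `compound` is kept above. A quick sketch of the full output for one of the short sentences:

In [ ]:
# Full VADER output for a single document; get_vader_scores keeps only 'compound'.
print(sid.polarity_scores(pos_k[2]))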

CASE STUDY 1: Kendra's Data

In [10]:
display(pd.DataFrame(get_vader_scores(neg_k, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_k, 'pos')))
label compound excerpt
0 neg 0.5255 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1 neg 0.7712 How can we trust Artificial Intelligence to dr...
2 neg -0.2244 I hate artificial intelligence!
3 neg -0.2942 My dog is terrified by artificial intelligence!
4 neg 0.5255 Artificial intelligence is going to melt the b...
label compound excerpt
0 pos 0.6705 My dog is excited by the advancements in artif...
1 pos 0.8271 I'm excited for my child to grow up and have t...
2 pos 0.8221 I love artificial intelligence!
3 pos 0.8213 Order my groceries, pay my taxes, take my kids...
4 pos 0.8402 I'm grateful every day that my child will like...

CASE STUDY 2: Ami's Data

In [12]:
display(pd.DataFrame(get_vader_scores(neg_a, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_a, 'pos')))
label compound excerpt
0 neg 0.7836 that's exactly how long the movie felt to me ....
1 neg -0.8481 " quest for camelot " is warner bros . ' firs...
2 neg -0.9753 so ask yourself what " 8mm " ( " eight millime...
3 neg 0.6824 synopsis : a mentally unstable man undergoing ...
4 neg -0.9879 capsule : in 2176 on the planet mars police ta...
label compound excerpt
0 pos -0.5887 films adapted from comic books have had plenty...
1 pos 0.9964 you've got mail works alot better than it dese...
2 pos 0.9868 " jaws " is a rare film that grabs your atten...
3 pos 0.8825 every now and then a movie comes along from a ...
4 pos -0.3525 moviemaking is a lot like being the general ma...
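
The compound scores above are left unthresholded, so there is no accuracy figure comparable to the TextBlob tables. A sketch (not part of the original run; the helper name `get_vader_accuracy` is just for illustration) that mirrors `get_pn` by treating compound < 0 as negative and scoring Ami's data the same way:

In [ ]:
import numpy as np

# Turn VADER compound scores into pos/neg predictions and measure agreement with the labels.
def get_vader_accuracy(array, label):
    df = pd.DataFrame(get_vader_scores(array, label))
    df['prediction'] = np.where(df['compound'] < 0, 'neg', 'pos')
    return (df['label'] == df['prediction']).mean()

print(get_vader_accuracy(neg_a, 'neg'), get_vader_accuracy(pos_a, 'pos'))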

NLTK with NaiveBayes

In [89]:
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def get_tokens(sentence):
    # Tokenize, keep alphabetic tokens only, and lowercase them.
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

def get_nltk_train_test(array, label):
    # Tokenize each document, pair it with its label, then split into train/test.
    tokens = [get_tokens(sentence) for sentence in array]
    docs = [(sent, label) for sent in tokens]
    # For the 1000-document Cornell sets: train on the first 800, test on the last 200.
    # (The commented-out split below appears to have been used for the small
    # five-document sets in Case Studies 1 and 2.)
#     train_docs = docs[:4]
#     test_docs = docs[4:5]
    train_docs = docs[:800]
    test_docs = docs[800:1000]
    return [train_docs, test_docs]


def get_nltk_NB(NEG_DATA, POS_DATA):
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg')
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos')

    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos

    # Mark negated tokens, build unigram features, and convert the docs to feature sets.
    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    # Train a Naive Bayes classifier and print accuracy, precision, recall, and F-measure.
    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    for key, value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key, value))
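
Inside `get_nltk_NB`, `mark_negation` is what lets the bag-of-words model tell negated phrases apart: tokens between a negation cue and the next clause punctuation get a `_NEG` suffix, so they become separate unigram features. A minimal sketch of that behavior on a toy token list (assuming the imports above):

In [ ]:
# mark_negation suffixes tokens that fall inside a negation scope.
print(mark_negation(['this', 'movie', 'is', 'not', 'good', 'at', 'all']))
# expected: ['this', 'movie', 'is', 'not', 'good_NEG', 'at_NEG', 'all_NEG']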

CASE STUDY 1: Kendra's Data

In [74]:
get_nltk_NB(neg_k, pos_k)
Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 1.0
F-measure [neg]: 1.0
F-measure [pos]: 1.0
Precision [neg]: 1.0
Precision [pos]: 1.0
Recall [neg]: 1.0
Recall [pos]: 1.0

CASE STUDY 2: Ami's Data

In [75]:
get_nltk_NB(neg_a, pos_a)
Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.5
F-measure [neg]: 0.6666666666666666
F-measure [pos]: None
Precision [neg]: 0.5
Precision [pos]: None
Recall [neg]: 1.0
Recall [pos]: 0.0

CASE STUDY 3: Cornell Data

In [90]:
get_nltk_NB(neg_cornell, pos_cornell)
Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8125
F-measure [neg]: 0.8259860788863109
F-measure [pos]: 0.7967479674796748
Precision [neg]: 0.7705627705627706
Precision [pos]: 0.8698224852071006
Recall [neg]: 0.89
Recall [pos]: 0.735