SENTIMENT ANALYSIS

(via these docs) | 10-06-19

STEP 1: Import ALL the things

In [55]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

STEP 2: Borrow subjective and objective sentences from the NLTK subjectivity corpus for practice

In [56]:
# How many sentences to take from each category of the corpus.
n_instances = 100

# Pair every tokenized sentence with its category label, so that each document
# is a (tokens, label) tuple.
# NOTE: the NLTK `subjectivity` corpus must be available locally
# (e.g. via nltk.download('subjectivity')).
subj_docs = [
    (tokens, 'subj')
    for tokens in subjectivity.sents(categories='subj')[:n_instances]
]
obj_docs = [
    (tokens, 'obj')
    for tokens in subjectivity.sents(categories='obj')[:n_instances]
]

STEP 3: Create test and train for both subj and obj

In [57]:
# Percentage of each category's documents used for training; the remainder is
# held out for testing.
TRAIN_PERCENT = 80

# Derive the split points from the actual number of documents rather than
# hard-coding 80/100, so the split stays 80/20 when `n_instances` is changed.
# Integer arithmetic avoids any float rounding surprises.
subj_cut = len(subj_docs) * TRAIN_PERCENT // 100
obj_cut = len(obj_docs) * TRAIN_PERCENT // 100

train_subj_docs = subj_docs[:subj_cut]
test_subj_docs = subj_docs[subj_cut:]
train_obj_docs = obj_docs[:obj_cut]
test_obj_docs = obj_docs[obj_cut:]

STEP 4: Combine the two test and train sets

In [58]:
# Merge the per-category splits: subjective documents first, then objective.
training_docs = [*train_subj_docs, *train_obj_docs]
testing_docs = [*test_subj_docs, *test_obj_docs]

STEP 5: Use mark_negation to mark negation in the training docs, then collect all of their words with SentimentAnalyzer

In [59]:
# Analyzer instance that the later cells keep building on.
sentim_analyzer = SentimentAnalyzer()
# Negation-mark every training document, then flatten all of them into one
# word list (duplicates kept).
all_words_neg = sentim_analyzer.all_words(list(map(mark_negation, training_docs)))
In [60]:
# Peek at a 20-word window of the negation-marked word list.
all_words_neg[25:45]
Out[60]:
['.',
 'and',
 'just',
 'enough',
 'science',
 'to',
 'send',
 'you',
 'home',
 'thinking',
 '.',
 'it',
 'is',
 'not',
 'a_NEG',
 'mass-market_NEG',
 'entertainment_NEG',
 'but_NEG',
 'an_NEG',
 'uncompromising_NEG']

Note how mark_negation appends '_NEG' to every word that follows a negation word (here, 'not'), up to the next punctuation mark

This is one of many ways we can determine sentiment

STEP 6: Use unigram_word_feats to get unigram features

In [61]:
# Select the unigrams to use as features. Per the NLTK API docs, `min_freq` is
# the minimum number of occurrences a word needs in `all_words_neg` to be kept,
# so rare words are dropped.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg, min_freq=4)
# Number of unigram features selected.
len(unigram_feats)
Out[61]:
83

STEP 7: Use add_feat_extractor to get a feature-value representation of our data

Apply to both training_docs and testing_docs (producing training_set and test_set)

In [62]:
# Register the unigram feature extractor on the analyzer, with the selected
# unigrams passed through as its `unigrams` keyword argument.
# NOTE(review): this call mutates `sentim_analyzer`; re-running the cell
# presumably registers the extractor a second time — confirm, and prefer
# Restart & Run All over re-executing this cell in isolation.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
In [63]:
# Convert each (tokens, label) training document into a
# (feature-dict, label) pair using the registered extractor(s).
training_set = sentim_analyzer.apply_features(training_docs)
# Show only the first instance; each one holds a 'contains(<word>)' flag per
# selected unigram, so the full set would flood the output.
training_set[:1]
Out[63]:
[({'contains(.)': True, 'contains(the)': False, 'contains(,)': True, 'contains(a)': True, 'contains(and)': True, 'contains(of)': False, 'contains(to)': False, 'contains(is)': True, 'contains(in)': False, 'contains(with)': False, 'contains(it)': False, 'contains(that)': False, 'contains(his)': False, 'contains(on)': False, 'contains(for)': False, 'contains(an)': False, 'contains(who)': False, 'contains(by)': False, 'contains(he)': False, 'contains(from)': False, 'contains(her)': False, 'contains(")': False, 'contains(film)': False, 'contains(as)': False, 'contains(this)': False, 'contains(movie)': False, 'contains(their)': False, 'contains(but)': False, 'contains(one)': True, 'contains(at)': False, 'contains(about)': True, 'contains(the_NEG)': False, 'contains(a_NEG)': False, 'contains(to_NEG)': False, 'contains(are)': False, "contains(there's)": False, 'contains(()': False, 'contains(story)': False, 'contains(when)': False, 'contains(so)': False, 'contains(be)': False, 'contains(,_NEG)': False, 'contains())': False, 'contains(they)': False, 'contains(you)': False, 'contains(not)': False, 'contains(have)': False, 'contains(like)': False, 'contains(will)': False, 'contains(all)': False, 'contains(into)': False, 'contains(out)': False, 'contains(she)': False, 'contains(what)': False, 'contains(life)': False, 'contains(has)': False, 'contains(its)': False, 'contains(only)': False, 'contains(more)': False, 'contains(even)': False, 'contains(--)': False, 'contains(:)': False, 'contains(can)': False, 'contains(;)': False, 'contains(home)': False, 'contains(look)': False, "contains(it's)": False, 'contains(if)': False, 'contains(where)': False, 'contains(most)': False, 'contains(him)': False, 'contains(search)': False, 'contains(but_NEG)': False, 'contains(love)': False, 'contains(both)': False, 'contains(make)': False, 'contains(begins)': False, 'contains(some)': False, 'contains(two)': False, 'contains(of_NEG)': False, 'contains(made)': False, 'contains(which)': False, 
'contains(them)': False}, 'subj')]
In [64]:
# Same feature extraction for the held-out documents. Note the variable is
# named `test_set` (not `testing_set`); the evaluation cell relies on it.
test_set = sentim_analyzer.apply_features(testing_docs)
# Show only the first instance to keep the output short.
test_set[:1]
Out[64]:
[({'contains(.)': True, 'contains(the)': True, 'contains(,)': False, 'contains(a)': True, 'contains(and)': False, 'contains(of)': True, 'contains(to)': False, 'contains(is)': False, 'contains(in)': False, 'contains(with)': True, 'contains(it)': False, 'contains(that)': False, 'contains(his)': False, 'contains(on)': False, 'contains(for)': True, 'contains(an)': False, 'contains(who)': False, 'contains(by)': False, 'contains(he)': False, 'contains(from)': False, 'contains(her)': False, 'contains(")': False, 'contains(film)': False, 'contains(as)': False, 'contains(this)': False, 'contains(movie)': False, 'contains(their)': False, 'contains(but)': False, 'contains(one)': False, 'contains(at)': False, 'contains(about)': False, 'contains(the_NEG)': False, 'contains(a_NEG)': False, 'contains(to_NEG)': False, 'contains(are)': False, "contains(there's)": False, 'contains(()': False, 'contains(story)': False, 'contains(when)': False, 'contains(so)': False, 'contains(be)': False, 'contains(,_NEG)': False, 'contains())': False, 'contains(they)': False, 'contains(you)': False, 'contains(not)': False, 'contains(have)': False, 'contains(like)': False, 'contains(will)': False, 'contains(all)': False, 'contains(into)': False, 'contains(out)': False, 'contains(she)': False, 'contains(what)': False, 'contains(life)': False, 'contains(has)': False, 'contains(its)': False, 'contains(only)': False, 'contains(more)': False, 'contains(even)': False, 'contains(--)': False, 'contains(:)': False, 'contains(can)': False, 'contains(;)': False, 'contains(home)': False, 'contains(look)': False, "contains(it's)": False, 'contains(if)': False, 'contains(where)': False, 'contains(most)': False, 'contains(him)': False, 'contains(search)': False, 'contains(but_NEG)': False, 'contains(love)': False, 'contains(both)': False, 'contains(make)': False, 'contains(begins)': False, 'contains(some)': False, 'contains(two)': False, 'contains(of_NEG)': False, 'contains(made)': False, 'contains(which)': False, 
'contains(them)': False}, 'subj')]

STEP 8: FINAL STEP!! We use Naive Bayes to create a trainer and FINALLY classify our data!

In [65]:
# `trainer` is the training function itself (it is not called here); the
# analyzer is handed the function together with the training set.
trainer = NaiveBayesClassifier.train
# Train through the analyzer and keep the resulting classifier; this prints
# "Training classifier" as a progress message.
classifier = sentim_analyzer.train(trainer, training_set)
Training classifier
In [66]:
# Evaluate once on the held-out set, then report every metric in
# alphabetical order of its name.
evaluation = sentim_analyzer.evaluate(test_set)
for metric in sorted(evaluation):
    print('{0}: {1}'.format(metric, evaluation[metric]))
Evaluating NaiveBayesClassifier results...
Accuracy: 0.8
F-measure [obj]: 0.8
F-measure [subj]: 0.8
Precision [obj]: 0.8
Precision [subj]: 0.8
Recall [obj]: 0.8
Recall [subj]: 0.8