SENTIMENT ANALYSIS -- (WITH HOMEMADE DATA!)

(adapted from the NLTK sentiment analysis HOWTO docs) | 10-06-19

STEP 1: Import ALL the things

In [11]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

STEP 2: Import the fake data you created just for this project

In [44]:
import os
import pandas as pd  # NOTE(review): pandas is never used below — kept for compatibility


def _read_folder(dirname):
    """Read every file in `dirname` and return a list of raw text strings.

    Parameters:
        dirname: path to a directory of plain-text files.

    Returns:
        list[str], one entry per file, in os.listdir order (not guaranteed sorted).
    """
    texts = []
    for name in os.listdir(dirname):
        # `with` guarantees the handle is closed even if .read() raises,
        # unlike the manual open/close pattern.
        with open(os.path.join(dirname, name)) as f:
            texts.append(f.read())
    return texts


# File listings kept as module-level names for parity with the original cell.
negative = os.listdir('AI_NEG/')
positive = os.listdir('AI_POS/')

# One raw document string per file.
positive_alltext = _read_folder('AI_POS/')
negative_alltext = _read_folder('AI_NEG/')

STEP 2b: Tokenize and clean the data

In [47]:
from nltk.tokenize import word_tokenize

def get_tokens(sentence):
    """Tokenize `sentence` and return lowercased, purely-alphabetic tokens.

    Punctuation and contraction fragments are dropped because str.isalpha()
    rejects apostrophes and digits (e.g. "n't" disappears, leaving "ca"
    from "can't").
    """
    cleaned = []
    for token in word_tokenize(sentence):
        if token.isalpha():
            cleaned.append(token.lower())
    return cleaned


# Tokenize every raw document from the previous cell.
negative_alltext_tokens = [get_tokens(text) for text in negative_alltext]
positive_alltext_tokens = [get_tokens(text) for text in positive_alltext]
In [48]:
# Pair each tokenized document with its sentiment label, producing the
# (tokens, label) shape that nltk's SentimentAnalyzer pipeline expects.
neg_docs = [(tokens, 'neg') for tokens in negative_alltext_tokens]
pos_docs = [(tokens, 'pos') for tokens in positive_alltext_tokens]

STEP 3: Create test and train sets for both neg and pos (the "subj and obj" split belongs to the NLTK subjectivity example this was adapted from)

In [49]:
# First N_TRAIN documents of each class train the model; the rest are held
# out for testing. Naming the split point removes the repeated magic
# numbers 4/5 (each class has 5 files, so the test slice is 1 document).
N_TRAIN = 4

train_neg_docs = neg_docs[:N_TRAIN]
test_neg_docs = neg_docs[N_TRAIN:]
train_pos_docs = pos_docs[:N_TRAIN]
test_pos_docs = pos_docs[N_TRAIN:]

STEP 4: Combine the two test and train sets

In [58]:
# Merge the per-class splits into single train / test collections
# (neg documents first, then pos, matching the original ordering).
training_docs = [*train_neg_docs, *train_pos_docs]
testing_docs = [*test_neg_docs, *test_pos_docs]

STEP 5: Use SentimentAnalyzer to mark negation in training docs

In [59]:
# mark_negation tags tokens that follow a negation cue with a "_NEG" suffix;
# all_words then flattens the marked documents into one vocabulary list.
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words(list(map(mark_negation, training_docs)))
In [62]:
all_words_neg
Out[62]:
['where',
 'are',
 'the',
 'jobs',
 'oh',
 'that',
 'right',
 'artificial',
 'intelligence',
 'took',
 'our',
 'jobs',
 'how',
 'can',
 'we',
 'trust',
 'artificial',
 'intelligence',
 'to',
 'drive',
 'our',
 'cars',
 'when',
 'they',
 'ca',
 'even',
 'hack',
 'a',
 'captcha',
 'i',
 'hate',
 'artificial',
 'intelligence',
 'my',
 'dog',
 'is',
 'terrified',
 'by',
 'artificial',
 'intelligence',
 'my',
 'dog',
 'is',
 'excited',
 'by',
 'the',
 'advancements',
 'in',
 'artificial',
 'intelligence',
 'i',
 'excited',
 'for',
 'my',
 'child',
 'to',
 'grow',
 'up',
 'and',
 'have',
 'time',
 'to',
 'daydream',
 'because',
 'artificial',
 'intelligence',
 'has',
 'taken',
 'care',
 'of',
 'all',
 'the',
 'nitty',
 'gritty',
 'i',
 'love',
 'artificial',
 'intelligence',
 'order',
 'my',
 'groceries',
 'pay',
 'my',
 'taxes',
 'take',
 'my',
 'kids',
 'to',
 'school',
 'yes',
 'please',
 'artificial',
 'intelligence',
 'has',
 'given',
 'me',
 'my',
 'life',
 'back']

Note how this sentiment analyzer is SUPPOSED to mark everything after a negation word with '_NEG'

However, no '_NEG' suffixes appear above: our cleaning step keeps only isalpha() tokens, so negation cues like "n't" were already stripped before mark_negation ran (note the bare 'ca' left over from "can't") — and our 10-file dataset is too small to show the effect in any case

STEP 6: Use unigram_word_feats to get unigrams features

In [63]:
# Distill the (tiny) vocabulary into a list of unigram feature words and
# display how many survive.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
n_feats = len(unigram_feats)
n_feats
Out[63]:
65

STEP 7: Use add_feat_extractor to get a feature-value representation of our data

Apply to both training_set and testing_set

In [64]:
# Register extract_unigram_feats (from nltk.sentiment.util) so apply_features
# below emits one boolean 'contains(word)' entry per unigram feature.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
In [65]:
# Convert each (tokens, label) training doc into a ({'contains(word)': bool, ...},
# label) pair via the registered feature extractor; show the first one.
training_set = sentim_analyzer.apply_features(training_docs)
training_set[:1]
Out[65]:
[({'contains(artificial)': True, 'contains(intelligence)': True, 'contains(my)': False, 'contains(to)': False, 'contains(the)': True, 'contains(i)': False, 'contains(jobs)': True, 'contains(our)': True, 'contains(dog)': False, 'contains(is)': False, 'contains(by)': False, 'contains(excited)': False, 'contains(has)': False, 'contains(where)': True, 'contains(are)': True, 'contains(oh)': True, 'contains(that)': True, 'contains(right)': True, 'contains(took)': True, 'contains(how)': False, 'contains(can)': False, 'contains(we)': False, 'contains(trust)': False, 'contains(drive)': False, 'contains(cars)': False, 'contains(when)': False, 'contains(they)': False, 'contains(ca)': False, 'contains(even)': False, 'contains(hack)': False, 'contains(a)': False, 'contains(captcha)': False, 'contains(hate)': False, 'contains(terrified)': False, 'contains(advancements)': False, 'contains(in)': False, 'contains(for)': False, 'contains(child)': False, 'contains(grow)': False, 'contains(up)': False, 'contains(and)': False, 'contains(have)': False, 'contains(time)': False, 'contains(daydream)': False, 'contains(because)': False, 'contains(taken)': False, 'contains(care)': False, 'contains(of)': False, 'contains(all)': False, 'contains(nitty)': False, 'contains(gritty)': False, 'contains(love)': False, 'contains(order)': False, 'contains(groceries)': False, 'contains(pay)': False, 'contains(taxes)': False, 'contains(take)': False, 'contains(kids)': False, 'contains(school)': False, 'contains(yes)': False, 'contains(please)': False, 'contains(given)': False, 'contains(me)': False, 'contains(life)': False, 'contains(back)': False}, 'neg')]
In [66]:
# Same feature-dict conversion for the held-out docs; show the first one.
test_set = sentim_analyzer.apply_features(testing_docs)
test_set[:1]
Out[66]:
[({'contains(artificial)': True, 'contains(intelligence)': True, 'contains(my)': False, 'contains(to)': True, 'contains(the)': True, 'contains(i)': False, 'contains(jobs)': False, 'contains(our)': True, 'contains(dog)': False, 'contains(is)': True, 'contains(by)': False, 'contains(excited)': False, 'contains(has)': False, 'contains(where)': False, 'contains(are)': False, 'contains(oh)': False, 'contains(that)': False, 'contains(right)': False, 'contains(took)': False, 'contains(how)': False, 'contains(can)': False, 'contains(we)': False, 'contains(trust)': False, 'contains(drive)': False, 'contains(cars)': False, 'contains(when)': False, 'contains(they)': False, 'contains(ca)': False, 'contains(even)': False, 'contains(hack)': False, 'contains(a)': False, 'contains(captcha)': False, 'contains(hate)': False, 'contains(terrified)': False, 'contains(advancements)': False, 'contains(in)': False, 'contains(for)': False, 'contains(child)': False, 'contains(grow)': False, 'contains(up)': False, 'contains(and)': False, 'contains(have)': False, 'contains(time)': False, 'contains(daydream)': False, 'contains(because)': False, 'contains(taken)': False, 'contains(care)': False, 'contains(of)': True, 'contains(all)': False, 'contains(nitty)': False, 'contains(gritty)': False, 'contains(love)': False, 'contains(order)': False, 'contains(groceries)': False, 'contains(pay)': False, 'contains(taxes)': False, 'contains(take)': False, 'contains(kids)': False, 'contains(school)': False, 'contains(yes)': False, 'contains(please)': False, 'contains(given)': False, 'contains(me)': False, 'contains(life)': False, 'contains(back)': False}, 'neg')]

STEP 8: FINAL STEP!! We use Naive Bayes to create a trainer and FINALLY classify our data!

In [67]:
# NaiveBayesClassifier.train is passed uncalled: sentim_analyzer.train invokes
# it on the feature-encoded training_set and returns the fitted classifier.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
Training classifier
In [68]:
# evaluate() returns a metric-name -> score mapping; print the metrics in
# alphabetical order. (Perfect 1.0 scores here reflect the 2-document test
# set, not a good model.)
results = sentim_analyzer.evaluate(test_set)
for metric in sorted(results):
    print('{0}: {1}'.format(metric, results[metric]))
Evaluating NaiveBayesClassifier results...
Accuracy: 1.0
F-measure [neg]: 1.0
F-measure [pos]: 1.0
Precision [neg]: 1.0
Precision [pos]: 1.0
Recall [neg]: 1.0
Recall [pos]: 1.0

CONCLUSION: We need more data