# (via these docs) | 10-06-19
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
import os
import pandas as pd
# List the file names found in each class directory.
negative = os.listdir('AI_NEG/')
positive = os.listdir('AI_POS/')

# Read every positive file fully into memory, one string per file.
positive_alltext = []
for file in positive:
    # `with` guarantees the handle is closed even if read() raises.
    with open(os.path.join('AI_POS/', file)) as f:
        content = f.read()
    positive_alltext.append(content)

# Same for the negative files.
negative_alltext = []
for file in negative:
    with open(os.path.join('AI_NEG/', file)) as f:
        content = f.read()
    negative_alltext.append(content)
from nltk.tokenize import word_tokenize
def get_tokens(sentence):
    """Return the lowercased, purely alphabetic tokens of ``sentence``.

    The text is split with ``word_tokenize``; tokens for which
    ``str.isalpha()`` is false (e.g. punctuation or digits) are dropped and
    the remaining tokens are lowercased.
    """
    return [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]
# Tokenise every document with get_tokens, keeping the two classes separate.
negative_alltext_tokens = list(map(get_tokens, negative_alltext))
positive_alltext_tokens = list(map(get_tokens, positive_alltext))

# Pair each token list with its class label to form (tokens, label) tuples.
neg_docs = [(tokens, 'neg') for tokens in negative_alltext_tokens]
pos_docs = [(tokens, 'pos') for tokens in positive_alltext_tokens]
# Split into test and train sets for both classes (neg and pos): the first
# four documents of each class go to training, the fifth to testing.
train_neg_docs = neg_docs[:4]
test_neg_docs = neg_docs[4:5]
train_pos_docs = pos_docs[:4]
test_pos_docs = pos_docs[4:5]
# Combine the per-class splits into the overall test and train sets.
training_docs = train_neg_docs + train_pos_docs
testing_docs = test_neg_docs + test_pos_docs
# Use SentimentAnalyzer to mark negation in the training docs and collect
# the words of the negation-marked documents.
# NOTE(review): mark_negation is presumably provided by the star import from
# nltk.sentiment.util -- confirm.
sentim_analyzer = SentimentAnalyzer()
all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
# Bare expression: has no effect in a script; presumably left over from an
# interactive session where it displayed the value.
all_words_neg
# However, we do not have enough data in our 10-text-file dataset to actually run this successfully.
# Use unigram_word_feats to get unigram features from the collected words.
unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
# Bare expression: no effect in a script (presumably an interactive display).
len(unigram_feats)

# Register the unigram extractor, parameterised with the features found above.
# NOTE(review): extract_unigram_feats is presumably provided by the star
# import from nltk.sentiment.util -- confirm.
sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)

# Apply the registered extractor to both splits. The bare slice expressions
# below have no effect in a script (presumably interactive peeks at the
# first item).
training_set = sentim_analyzer.apply_features(training_docs)
training_set[:1]
test_set = sentim_analyzer.apply_features(testing_docs)
test_set[:1]
# Train with the Naive Bayes trainer on the training features, then print
# each evaluation result as "name: value", ordered by name.
trainer = NaiveBayesClassifier.train
classifier = sentim_analyzer.train(trainer, training_set)
results = sentim_analyzer.evaluate(test_set)
for metric in sorted(results):
    print('{0}: {1}'.format(metric, results[metric]))