# via this tutorial | 10-6-19
from textblob import TextBlob
from IPython.display import display, HTML
import os
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def get_data_from_files(path):
    """Read every file in directory *path* and return their contents.

    Args:
        path: Directory to read. A trailing separator is no longer required
            (entries are joined with os.path.join, which also accepts the
            original trailing-slash form).

    Returns:
        list[str]: One string per file, in sorted filename order so the
        result is deterministic across filesystems (downstream code slices
        this list, so a stable order matters).
    """
    results = []
    for name in sorted(os.listdir(path)):
        # Context manager guarantees the handle is closed even if read() raises.
        with open(os.path.join(path, name)) as f:
            results.append(f.read())
    return results
neg_k = get_data_from_files('AI_NEG/')
pos_k = get_data_from_files('AI_POS/')
neg_a = get_data_from_files('NEG/')
pos_a = get_data_from_files('POS/')
def get_pn(num):
    """Map a polarity score to 'neg' (score < 0) or 'pos' (score >= 0)."""
    if num < 0:
        return 'neg'
    return 'pos'
def get_sentiment(array, label):
    """Score each text with TextBlob and summarize it as one dict per text.

    Each dict carries the supplied gold *label*, the predicted label derived
    from the polarity sign, the raw polarity, the text length, and a
    50-character excerpt.
    """
    rows = []
    for text in array:
        polarity = TextBlob(text).sentiment.polarity
        rows.append({'label': label,
                     'prediction': get_pn(polarity),
                     'sentiment': polarity,
                     'length': len(text),
                     'excerpt': text[:50]})
    return rows
display(pd.DataFrame(get_sentiment(neg_k, 'neg')))
display(pd.DataFrame(get_sentiment(pos_k, 'pos')))
display(pd.DataFrame(get_sentiment(neg_a, 'neg')))
display(pd.DataFrame(get_sentiment(pos_a, 'pos')))
def get_vader_scores(array, label):
    """Score each sentence with the shared VADER analyzer.

    Returns one dict per sentence holding the supplied gold *label*, the
    VADER compound score, and a 50-character excerpt.
    """
    return [{'label': label,
             'compound': sid.polarity_scores(sentence)['compound'],
             'excerpt': sentence[:50]}
            for sentence in array]
display(pd.DataFrame(get_vader_scores(neg_k, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_k, 'pos')))
display(pd.DataFrame(get_vader_scores(neg_a, 'neg')))
display(pd.DataFrame(get_vader_scores(pos_a, 'pos')))
from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
def get_tokens(sentence):
    """Tokenize *sentence*, keeping only purely-alphabetic tokens, lowercased."""
    return [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]
def get_nltk_train_test(array, label, train_size=4):
    """Tokenize each document, attach *label*, and split into train/test sets.

    Args:
        array: Iterable of raw text documents.
        label: Gold label attached to every document.
        train_size: Number of leading documents used for training. Defaults
            to 4, matching the previously hard-coded split; exactly one
            document following the split is used for testing (documents
            beyond that are still unused, as before).

    Returns:
        [train_docs, test_docs], where each element is a list of
        (token_list, label) pairs.
    """
    tokens = [get_tokens(sentence) for sentence in array]
    docs = [(sent, label) for sent in tokens]
    train_docs = docs[:train_size]
    test_docs = docs[train_size:train_size + 1]
    return [train_docs, test_docs]
def get_nltk_NB(NEG_DATA, POS_DATA):
    """Train and evaluate an NLTK Naive Bayes sentiment classifier.

    Builds per-class train/test splits via get_nltk_train_test, extracts
    negation-marked unigram features, trains nltk's NaiveBayesClassifier,
    and prints every metric returned by SentimentAnalyzer.evaluate.

    Args:
        NEG_DATA: Iterable of raw negative-class documents.
        POS_DATA: Iterable of raw positive-class documents.
    """
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg')
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos')
    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos
    sentim_analyzer = SentimentAnalyzer()
    # mark_negation / extract_unigram_feats come from the wildcard import of
    # nltk.sentiment.util at the top of the file.
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    # Convert the (tokens, label) docs into feature-dict form for training/eval.
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)
    # Pass the train *function* itself; SentimentAnalyzer.train invokes it
    # with the prepared training set.
    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    # sorted() gives a stable metric ordering in the printed report.
    for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key,value))
get_nltk_NB(neg_k, pos_k)
get_nltk_NB(neg_a, pos_a)