HW4 PIPELINE + HW6 + HW7 + HW8 (Topic Modeling)

Building off HW2 + HW3

In [18]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every entry of a directory and return the contents as strings.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per directory entry, in ``os.listdir`` order.
    """
    results = []
    for file_name in os.listdir(path):
        # os.path.join works with or without a trailing separator on `path`;
        # plain string concatenation built a wrong path without one.
        # The context manager closes the handle even if read() raises.
        with open(os.path.join(path, file_name)) as handle:
            results.append(handle.read())
    return results

## =======================================================
## TOKENIZING
## =======================================================
from nltk.tokenize import word_tokenize, sent_tokenize
def get_tokens(sentence):
    """Split text into lowercase, purely alphabetic word tokens.

    Args:
        sentence: Raw text to tokenize.

    Returns:
        A list of lowercased tokens; any token that is not entirely
        alphabetic (punctuation, numbers, "'s", ...) is dropped.
    """
    clean_tokens = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        clean_tokens.append(token.lower())
    return clean_tokens

def get_sentence_tokens(review):
    """Return the sentences of ``review`` as produced by ``sent_tokenize``."""
    sentences = sent_tokenize(review)
    return sentences

## =======================================================
## REMOVING STOPWORDS
## =======================================================
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    """Drop English stop words from a token sequence.

    Args:
        sentence: Iterable of tokens (not a raw string).

    Returns:
        A new list with the tokens that are not in ``stop_words``,
        in their original order.
    """
    return [token for token in sentence if token not in stop_words]

## =======================================================
## FREQUENCY DISTRIBUTIONS
## =======================================================
from nltk.probability import FreqDist
def get_most_common(tokens, n=12):
    """Return the ``n`` most frequent tokens with their counts.

    This function used to be defined twice in a row with identical bodies;
    a single definition is kept.

    Args:
        tokens: Iterable of tokens to count.
        n: How many (token, count) pairs to return. Defaults to 12, the
            value that was previously hard-coded.

    Returns:
        A list of ``(token, count)`` tuples, most frequent first.
    """
    return FreqDist(tokens).most_common(n)

def get_fdist(tokens):
    """Build an NLTK ``FreqDist`` over ``tokens`` and return it."""
    distribution = FreqDist(tokens)
    return distribution

## =======================================================
## SENTIMENT ANALYSIS
## =======================================================
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

def get_vader_score(review):
    """Score ``review`` with the shared VADER analyzer.

    Returns:
        Whatever ``sid.polarity_scores`` returns; callers in this file read
        the keys 'compound', 'neg', 'neu' and 'pos' from it.
    """
    scores = sid.polarity_scores(review)
    return scores

def separate_vader_score(vader_score, key):
    """Pick a single component out of a score mapping.

    Args:
        vader_score: Mapping of score name -> value.
        key: Name of the component to extract.

    Returns:
        The value stored under ``key``.
    """
    component = vader_score[key]
    return component

## =======================================================
## SUMMARIZER
## =======================================================
def get_weighted_freq_dist(review, freq_dist):
    """Normalise a frequency distribution in place by its largest count.

    Args:
        review: Unused; kept so existing call sites keep working.
        freq_dist: Mutable mapping of word -> count. It is modified in
            place, so every other reference to the same object sees the
            weighted values as well.

    Returns:
        The same ``freq_dist`` object, with each value divided by the
        maximum value. When the mapping is empty or its maximum is 0 the
        divisor is 1, so nothing is divided by zero.
    """
    # Explicit fallback instead of a bare ``except`` that also swallowed
    # unrelated errors (and KeyboardInterrupt).
    max_freq = max(freq_dist.values(), default=0) or 1
    for word in freq_dist:
        freq_dist[word] = freq_dist[word] / max_freq
    return freq_dist
        

def get_sentence_score(review, freq_dist):
    """Score sentences by summing the weights of the known words they contain.

    Args:
        review: Iterable of sentence strings.
        freq_dist: Mapping of word -> weight.

    Returns:
        A dict mapping sentence -> summed weight. A sentence appears only
        if it splits into fewer than 30 space-separated pieces and at
        least one of its tokens is a key of ``freq_dist``.
    """
    sentence_scores = {}
    for sent in review:
        # The length test does not depend on the word, so do it once per
        # sentence instead of once per token.
        if len(sent.split(' ')) >= 30:
            continue
        # Use the `word_tokenize` name this file imports explicitly rather
        # than `nltk.word_tokenize`: the `nltk` module itself is never
        # imported by name here.
        for word in word_tokenize(sent.lower()):
            if word in freq_dist:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + freq_dist[word]
    return sentence_scores

def get_summary_sentences(sentence_scores):
    """Concatenate the five highest-scoring sentences, best first.

    Args:
        sentence_scores: Mapping of sentence -> score.

    Returns:
        The top five sentences joined with no separator between them.
    """
    # Sorting is stable, so sentences with equal scores keep the order in
    # which they were inserted into the mapping.
    ranked = sorted(sentence_scores, key=sentence_scores.get, reverse=True)
    top_five = ranked[:5]
    return ''.join(top_five)

def get_freq_words(freq_dist):
    """Return the 50 highest-valued words as one space-separated string.

    Args:
        freq_dist: Mapping of word -> frequency (or weight).

    Returns:
        Up to 50 words, highest value first, joined by single spaces.
    """
    # Stable sort: equal values keep their insertion order.
    ranked = sorted(freq_dist, key=freq_dist.get, reverse=True)
    top_fifty = ranked[:50]
    return ' '.join(top_fifty)

## =======================================================
## MACHINE LEARNING -- NAIVE BAYES
## =======================================================
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report

# def get_NB(small_df, labels):
#     x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)

#     gnb = GaussianNB()
#     gnb.fit(x_train, y_train)
#     y_pred = gnb.predict(x_test)
#     print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
    
    
def get_NB(small_df, labels, classifier, title, seeds=(109, 210, 420, 19, 7)):
    """Train and evaluate a classifier over several random 70/30 splits.

    For each seed the data is split, the classifier is fitted on the
    training part, and the test accuracy and flattened confusion matrix
    are printed.

    Args:
        small_df: DataFrame of features; its ``.values`` are used.
        labels: Target labels aligned with the rows of ``small_df``.
        classifier: Estimator with ``fit`` and ``predict``. The same
            instance is refitted for every seed.
        title: Heading printed before the results.
        seeds: Random states for ``train_test_split``. Defaults to the
            five seeds that were previously hard-coded.

    Returns:
        A list with one single-column DataFrame per seed holding the
        flattened confusion matrix.
    """
    dfs = []
    overall = []
    print(title)
    for seed in seeds:
        x_train, x_test, y_train, y_test = train_test_split(
            small_df.values, labels, test_size=0.3, random_state=seed)
        # The training-set score and the classification report used to be
        # computed here and thrown away; the report also emitted
        # UndefinedMetricWarning for classes that were never predicted.
        classifier.fit(x_train, y_train)
        y_pred = classifier.predict(x_test)
        accuracy = metrics.accuracy_score(y_test, y_pred)
        print("Accuracy:", accuracy)
        overall.append(accuracy)
        cm = confusion_matrix(y_test, y_pred)
        print(cm.ravel())
        dfs.append(pd.DataFrame(cm.ravel()))
    # Guard against an empty `seeds` sequence.
    if overall:
        print('AVERAGE ACCURACY:', sum(overall)/len(overall))
    return dfs


def display_NB_tables(dfs):
    """Render each DataFrame in ``dfs`` with the notebook's ``display``.

    Args:
        dfs: Iterable of DataFrames, e.g. the list returned by ``get_NB``.
    """
    for df in dfs:
        # `display` renders the table itself; wrapping it in print() only
        # added a stray "None" line after every table.
        display(df)
        
## =======================================================
## PLOTS
## =======================================================        
import seaborn as sns
import matplotlib.pyplot as plt 
def bar_plot(df, title):
    """Draw a word-count bar chart and return the ``pyplot`` module.

    NOTE: ``bar_plot`` is defined twice in this cell with the same body;
    the later definition is the one that is in effect after the cell runs.

    Args:
        df: Data with a "word" column (x axis) and a "count" column (y axis).
        title: Title placed above the chart.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain
        ``.show()`` / ``.savefig()``.
    """
    # Set the context before drawing: it changes global plot parameters, so
    # calling it after the chart was built only affected later figures.
    sns.set_context("talk")
    sns.barplot(y="count", x="word", data=df, palette="husl")
    plt.title(title)
    plt.xlabel("Word")
    plt.ylabel("Count")
    plt.xticks(rotation=90)
    return plt

from nltk.tokenize import casual_tokenize
from collections import Counter


## =======================================================
## CLEANERS
## =======================================================   
import re, string
# Runs of non-word characters and underscores. Compiled once, and written as
# a raw string so that ``\W`` is not parsed as a string escape.
_NON_WORD_RE = re.compile(r'[\W_]+')


def diy_cleaner(review):
    """Build a lowercase "title title body" string from a review.

    The first line of ``review`` is the title and the second line is the
    body. When there is no second line, the whole text serves as both.
    The body has apostrophes removed and every run of non-word characters
    or underscores replaced by one space; the title is left as it is and
    repeated twice.

    Args:
        review: Raw review text.

    Returns:
        ``title + ' ' + title + ' ' + cleaned_body``, lowercased.
    """
    lines = review.split('\n')
    title = lines[0]
    # Explicit check instead of a bare ``except`` around an IndexError.
    body = lines[1] if len(lines) > 1 else review
    body = _NON_WORD_RE.sub(' ', body.replace("'", ""))
    cleaned = title + ' ' + title + ' ' + body
    return cleaned.lower()

def pruner(review):
    """Keep only the words of ``review`` that are longer than three characters.

    Args:
        review: Text whose whitespace-separated words are filtered.

    Returns:
        The surviving words joined by single spaces.
    """
    kept = []
    for word in review.split():
        if len(word) > 3:
            kept.append(word)
    return ' '.join(kept)

sentim_analyzer = SentimentAnalyzer()
def get_nltk_negs(tokens):
    """Run ``mark_negation`` on ``tokens`` and flatten the result.

    Args:
        tokens: List of word tokens for one document.

    Returns:
        The result of ``sentim_analyzer.all_words`` applied to a
        one-document list holding the negation-marked tokens.
    """
    marked = mark_negation(tokens)
    return sentim_analyzer.all_words([marked])

def get_unigram_feats(neg_tokens):
    """Return ``sentim_analyzer.unigram_word_feats`` for ``neg_tokens``."""
    return sentim_analyzer.unigram_word_feats(neg_tokens)
    
def get_bigram_feats(tokens):
    """Join every pair of adjacent tokens with an underscore.

    Args:
        tokens: Sliceable sequence of string tokens.

    Returns:
        A list such as ``['a_series', 'series_of']``; empty when there are
        fewer than two tokens.
    """
    adjacent_pairs = zip(tokens, tokens[1:])
    return ["_".join(pair) for pair in adjacent_pairs]

## =======================================================
## HELPERS
## =======================================================  
def get_bow_from_column(df, column):
    """Count whitespace-separated words across one text column.

    Args:
        df: DataFrame holding the text.
        column: Name of a column whose values are strings.

    Returns:
        A ``Counter`` of word -> occurrences over all rows.
    """
    texts = df[column].tolist()
    words = ' '.join(texts).split()
    return Counter(words)

def get_common_words(num, neg_bow=None, pos_bow=None):
    """Compare the ``num`` most common words of two bags of words.

    Args:
        num: How many top words to take from each bag.
        neg_bow: Counter-like bag for the negative class. When omitted,
            the module-level ``big_bow_n`` is used.
        pos_bow: Counter-like bag for the positive class. When omitted,
            the module-level ``big_bow_p`` is used.

    Returns:
        A 7-item list: number of shared words, number of negative-only
        words, number of positive-only words, shared count divided by
        ``num`` (0.0 when ``num`` is 0), then the three word arrays in the
        same order (shared, negative-only, positive-only).
    """
    # The globals are only looked up when no bag is passed in, so existing
    # one-argument calls behave as before.
    if neg_bow is None:
        neg_bow = big_bow_n
    if pos_bow is None:
        pos_bow = big_bow_p
    most_common_neg = [word for word, _count in neg_bow.most_common(num)]
    most_common_pos = [word for word, _count in pos_bow.most_common(num)]
    in_both = np.intersect1d(most_common_neg, most_common_pos)
    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)
    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)
    shared_ratio = len(in_both) / num if num else 0.0
    return [len(in_both), len(neg_notpos), len(pos_notneg), shared_ratio,
            in_both, neg_notpos, pos_notneg]

def get_only_polarized(tokens, common_words):
    """Drop the tokens that appear in the shared-word collection.

    Args:
        tokens: Iterable of tokens to filter.
        common_words: Sequence whose item at index 4 is the collection of
            shared words (the layout returned by ``get_common_words``).

    Returns:
        A list of the tokens not found in ``common_words[4]``.
    """
    polarized = []
    for token in tokens:
        if token in common_words[4]:
            continue
        polarized.append(token)
    return polarized

## =======================================================
## VISUALS
## =======================================================  
# import wordcloud
# from wordcloud import WordCloud, ImageColorGenerator
# from PIL import Image
# import seaborn as sns
# import matplotlib.pyplot as plt 
# def create_word_cloud_with_mask(path_of_mask_image, dictionary, 
#                                 max_num_words, title):
#         mask = np.array(Image.open(path_of_mask_image))
#         word_cloud = WordCloud(background_color = "white", 
#                                max_words = max_num_words, 
#                                mask = mask, max_font_size = 125, 
#                                random_state = 1006)
#         word_cloud.generate_from_frequencies(dictionary)
#         image_colors = ImageColorGenerator(mask)
#         plt.figure(figsize = [8,8])
#         plt.imshow(word_cloud.recolor(color_func = image_colors), interpolation = "bilinear")
#         plt.title(title)
#         sns.set_context("poster")
#         plt.axis("off")
#         return plt
    
import seaborn as sns
import matplotlib.pyplot as plt 
def bar_plot(df, title):
    """Draw a word-count bar chart and return the ``pyplot`` module.

    NOTE: ``bar_plot`` is defined twice in this cell with the same body;
    this later definition is the one in effect after the cell runs.

    Args:
        df: Data with a "word" column (x axis) and a "count" column (y axis).
        title: Title placed above the chart.

    Returns:
        The ``matplotlib.pyplot`` module, so callers can chain
        ``.show()`` / ``.savefig()``.
    """
    # Set the context before drawing: it changes global plot parameters, so
    # calling it after the chart was built only affected later figures.
    sns.set_context("talk")
    sns.barplot(y="count", x="word", data=df, palette="husl")
    plt.title(title)
    plt.xlabel("Word")
    plt.ylabel("Count")
    plt.xticks(rotation=90)
    return plt
In [10]:
import pandas as pd
import numpy as np

#########################
## KAGGLE SENTIMENT
#########################

# Load the Kaggle sentiment training file (tab-separated).
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
# Labels come from the 'Sentiment' column, raw text from 'Phrase'.
y=train['Sentiment'].values
X=train['Phrase'].values

# One row per phrase: the text lands in the default column 0, the label in 'labels'.
all_df = pd.DataFrame(X)
all_df['labels'] = y
all_df
Out[10]:
0 labels
0 A series of escapades demonstrating the adage ... 1
1 A series of escapades demonstrating the adage ... 2
2 A series 2
3 A 2
4 series 2
... ... ...
156055 Hearst 's 2
156056 forced avuncular chortles 1
156057 avuncular chortles 3
156058 avuncular 2
156059 chortles 2

156060 rows × 2 columns

In [12]:
# neg = get_data_from_files('../NEG_JK/')
# pos = get_data_from_files('../POS_JK/')

# neg = get_data_from_files('../neg_cornell/')
# pos = get_data_from_files('../pos_cornell/')

# neg = get_data_from_files('../neg_hw4/')
# pos = get_data_from_files('../pos_hw4/')

# neg = get_data_from_files('../hw4_lie_false/')
# pos = get_data_from_files('../hw4_lie_true/')

# pos = get_data_from_files('../hw4_lie_false/')
# neg = get_data_from_files('../hw4_lie_true/')

# neg_df = pd.DataFrame(neg)
# pos_df = pd.DataFrame(pos)

# pos_df['PoN'] = 'P'
# neg_df['PoN'] = 'N'
# all_df = neg_df.append(pos_df)

# Feature-building cell: every statement adds one derived column to all_df.
# Column 0 holds the raw text. The order matters because later columns are
# computed from earlier ones.

# Lowercased alphabetic tokens and their count.
all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)

# Drop rows that have no alphabetic token at all.
all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)

all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

# Tokens with English stop words removed.
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)

# Most common tokens, before and after stop-word filtering.
all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)
all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)

all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)
all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)

# VADER scores of the raw text, then one column per component.
all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)
all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)
all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)
all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)
all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)

# get_weighted_freq_dist rescales the passed object in place and returns it,
# so 'weighted_freq_dist' and 'freq_dist' hold the same objects and the
# sentence scores below are computed from the weighted values.
all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)
all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)
all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)

# VADER scores of the extractive summary.
all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)
all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)
all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)
all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)
all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)

# Space-joined string of the highest-weighted filtered words (at most 50).
all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)

# VADER scores of that frequent-word string.
all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)
all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)
all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)
all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)
all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)

# Per-row bags of words, with and without stop words.
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
all_df['bow_nosw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)

# Cleaned text, then the same text with words of three characters or fewer removed.
all_df['diy_cleaner'] = all_df.apply(lambda x: diy_cleaner(x[0]), axis=1)
all_df['pruned'] = all_df.apply(lambda x: pruner(x['diy_cleaner']), axis=1)

# Negation-marked tokens and unigram/bigram features.
all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)
all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
all_df['bigram_feats'] = all_df.apply(lambda x: get_bigram_feats(x['tokens']), axis=1)
all_df['bigram_feats_neg'] = all_df.apply(lambda x: get_bigram_feats(x['nltk_negs']), axis=1)

# Corpus-wide bag of words over 'pruned', then one bag per label value.
# NOTE(review): bags are built for labels 1..5. If the labels actually run
# 0..4 (the confusion matrices printed further down have 25 entries, i.e.
# five classes), label 0 gets no bag and big_bow_5 is empty -- confirm.
# NOTE(review): big_bow_n / big_bow_p, used by get_common_words and the
# word-cloud cells, are not defined here.
big_bow = get_bow_from_column(all_df, 'pruned')
big_bow_1 = get_bow_from_column(all_df[all_df['labels'] == 1], 'pruned')
big_bow_2 = get_bow_from_column(all_df[all_df['labels'] == 2], 'pruned')
big_bow_3 = get_bow_from_column(all_df[all_df['labels'] == 3], 'pruned')
big_bow_4 = get_bow_from_column(all_df[all_df['labels'] == 4], 'pruned')
big_bow_5 = get_bow_from_column(all_df[all_df['labels'] == 5], 'pruned')

# most_common_1 = [word[0] for word in big_bow_n.most_common(100)]
# most_common_2 = [word[0] for word in big_bow_p.most_common(100)]



# all_df['no_shared_words'] = all_df.apply(lambda x: get_only_polarized(x['tokens'], get_common_words(500)), axis=1)
In [13]:
# Preview the first three rows of the feature table.
all_df[:3]
Out[13]:
0 labels tokens num_tokens sentences num_sentences no_sw num_no_sw topwords_unfil topwords_fil ... v_neu_fd v_pos_fd bow bow_nosw diy_cleaner pruned nltk_negs unigram_feats bigram_feats bigram_feats_neg
0 A series of escapades demonstrating the adage ... 1 [a, series, of, escapades, demonstrating, the,... 35 [A series of escapades demonstrating the adage... 1 [series, escapades, demonstrating, adage, good... 15 [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)... [(good, 2), (series, 1), (escapades, 1), (demo... ... 0.693 0.307 {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,... {'series': 1, 'escapades': 1, 'demonstrating':... a series of escapades demonstrating the adage ... series escapades demonstrating adage that what... [a, series, of, escapades, demonstrating, the,... [the, of, is, good, for, of_NEG, a, series, es... [a_series, series_of, of_escapades, escapades_... [a_series, series_of, of_escapades, escapades_...
1 A series of escapades demonstrating the adage ... 2 [a, series, of, escapades, demonstrating, the,... 14 [A series of escapades demonstrating the adage... 1 [series, escapades, demonstrating, adage, good... 6 [(the, 2), (a, 1), (series, 1), (of, 1), (esca... [(series, 1), (escapades, 1), (demonstrating, ... ... 0.633 0.367 {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,... {'series': 1, 'escapades': 1, 'demonstrating':... a series of escapades demonstrating the adage ... series escapades demonstrating adage that what... [a, series, of, escapades, demonstrating, the,... [the, a, series, of, escapades, demonstrating,... [a_series, series_of, of_escapades, escapades_... [a_series, series_of, of_escapades, escapades_...
2 A series 2 [a, series] 2 [A series] 1 [series] 1 [(a, 1), (series, 1)] [(series, 1)] ... 1.000 0.000 {'a': 1, 'series': 1} {'series': 1} a series a series a series series series series [a, series] [a, series] [a_series] [a_series]

3 rows × 39 columns

SAVE TO CSV!

In [14]:
# Persist the feature table. The CSV is written before 'PoN' is added,
# so the file does not contain that column.
all_df.to_csv('hw7_data_sentiment_v2.csv',index=False)
# 'PoN' is a copy of 'labels'; the classifier cells below use it as the target.
all_df['PoN'] = all_df['labels']
all_df
Out[14]:
0 labels tokens num_tokens sentences num_sentences no_sw num_no_sw topwords_unfil topwords_fil ... v_pos_fd bow bow_nosw diy_cleaner pruned nltk_negs unigram_feats bigram_feats bigram_feats_neg PoN
0 A series of escapades demonstrating the adage ... 1 [a, series, of, escapades, demonstrating, the,... 35 [A series of escapades demonstrating the adage... 1 [series, escapades, demonstrating, adage, good... 15 [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)... [(good, 2), (series, 1), (escapades, 1), (demo... ... 0.307 {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,... {'series': 1, 'escapades': 1, 'demonstrating':... a series of escapades demonstrating the adage ... series escapades demonstrating adage that what... [a, series, of, escapades, demonstrating, the,... [the, of, is, good, for, of_NEG, a, series, es... [a_series, series_of, of_escapades, escapades_... [a_series, series_of, of_escapades, escapades_... 1
1 A series of escapades demonstrating the adage ... 2 [a, series, of, escapades, demonstrating, the,... 14 [A series of escapades demonstrating the adage... 1 [series, escapades, demonstrating, adage, good... 6 [(the, 2), (a, 1), (series, 1), (of, 1), (esca... [(series, 1), (escapades, 1), (demonstrating, ... ... 0.367 {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,... {'series': 1, 'escapades': 1, 'demonstrating':... a series of escapades demonstrating the adage ... series escapades demonstrating adage that what... [a, series, of, escapades, demonstrating, the,... [the, a, series, of, escapades, demonstrating,... [a_series, series_of, of_escapades, escapades_... [a_series, series_of, of_escapades, escapades_... 2
2 A series 2 [a, series] 2 [A series] 1 [series] 1 [(a, 1), (series, 1)] [(series, 1)] ... 0.000 {'a': 1, 'series': 1} {'series': 1} a series a series a series series series series [a, series] [a, series] [a_series] [a_series] 2
3 A 2 [a] 1 [A] 1 [] 0 [(a, 1)] [] ... 0.000 {'a': 1} {} a a a [a] [a] [] [] 2
4 series 2 [series] 1 [series] 1 [series] 1 [(series, 1)] [(series, 1)] ... 0.000 {'series': 1} {'series': 1} series series series series series series [series] [series] [] [] 2
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
156055 Hearst 's 2 [hearst] 1 [Hearst 's] 1 [hearst] 1 [(hearst, 1)] [(hearst, 1)] ... 0.000 {'hearst': 1} {'hearst': 1} hearst 's hearst 's hearst s hearst hearst hearst [hearst] [hearst] [] [] 2
156056 forced avuncular chortles 1 [forced, avuncular, chortles] 3 [forced avuncular chortles] 1 [forced, avuncular, chortles] 3 [(forced, 1), (avuncular, 1), (chortles, 1)] [(forced, 1), (avuncular, 1), (chortles, 1)] ... 0.000 {'forced': 1, 'avuncular': 1, 'chortles': 1} {'forced': 1, 'avuncular': 1, 'chortles': 1} forced avuncular chortles forced avuncular cho... forced avuncular chortles forced avuncular cho... [forced, avuncular, chortles] [forced, avuncular, chortles] [forced_avuncular, avuncular_chortles] [forced_avuncular, avuncular_chortles] 1
156057 avuncular chortles 3 [avuncular, chortles] 2 [avuncular chortles] 1 [avuncular, chortles] 2 [(avuncular, 1), (chortles, 1)] [(avuncular, 1), (chortles, 1)] ... 0.000 {'avuncular': 1, 'chortles': 1} {'avuncular': 1, 'chortles': 1} avuncular chortles avuncular chortles avuncula... avuncular chortles avuncular chortles avuncula... [avuncular, chortles] [avuncular, chortles] [avuncular_chortles] [avuncular_chortles] 3
156058 avuncular 2 [avuncular] 1 [avuncular] 1 [avuncular] 1 [(avuncular, 1)] [(avuncular, 1)] ... 0.000 {'avuncular': 1} {'avuncular': 1} avuncular avuncular avuncular avuncular avuncular avuncular [avuncular] [avuncular] [] [] 2
156059 chortles 2 [chortles] 1 [chortles] 1 [chortles] 1 [(chortles, 1)] [(chortles, 1)] ... 0.000 {'chortles': 1} {'chortles': 1} chortles chortles chortles chortles chortles chortles [chortles] [chortles] [] [] 2

154050 rows × 40 columns

In [19]:
# Gaussian NB on the four VADER scores of the raw text.
small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu'])
tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores -- Gaussian')
# display_NB_tables(tables)
Vader Scores -- Gaussian
Accuracy: 0.5212593313859136
[  154   764  1031   212    19   252  1893  4803   937   123   234  1958
 18145  2414   782    45   445  4544  3060  1692     2    73   793  1002
   838]
Accuracy: 0.5159796602834578
[  196   649  1029   204    25   382  1799  4858   990    91   394  1848
 17786  2783   582    56   467  4522  3305  1391     5    75   797  1221
   760]
Accuracy: 0.5136427566807313
[  211   693  1032   193    32   444  1778  4807   925   123   410  1802
 17971  2467   809    49   415  4636  2976  1668     6    77   820  1069
   802]
Accuracy: 0.515070864437953
[  174   701  1037   184    32   319  1826  4896   936   134   333  1949
 17924  2570   706    58   434  4647  3069  1578     4    79   802  1012
   811]
Accuracy: 0.514226982581413
[  164   707  1001   192    19   284  1897  4922   929   124   300  1897
 17822  2514   775    59   437  4595  3037  1699     3    77   831  1085
   845]
AVERAGE ACCURACY: 0.5160359190738937
In [20]:
# Multinomial NB on the 'v_pos' and 'v_neu' scores only (note the title says
# "Positive" although the neutral score is included as well).
small_df = all_df.filter(['v_pos','v_neu'])
tables = get_NB(small_df, all_df['PoN'], MultinomialNB(), 'Positive Vader Scores -- Multinomial')
Positive Vader Scores -- Multinomial
Accuracy: 0.5118684409823651
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
[    0     0  2180     0     0     0     0  8004     4     0     0     0
 23351   182     0     0     0  9481   305     0     0     0  2588   120
     0]
Accuracy: 0.5092935194201017
[    0     0  2102     1     0     0     0  8113     7     0     0     0
 23180   213     0     0     0  9384   357     0     0     0  2691   167
     0]
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Accuracy: 0.5111976630963972
[    0     0  2159     2     0     0     0  8069     8     0     0     0
 23252   207     0     0     0  9371   373     0     0     0  2623   151
     0]
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.
  'precision', 'predicted', average, warn_for)
Accuracy: 0.5115005950448989
[    0     0  2127     1     0     0     0  8104     7     0     0     0
 23272   210     0     0     0  9419   367     0     0     0  2539   169
     0]
Accuracy: 0.5078004976739154
[    0     0  2083     0     0     0     0  8149     7     0     0     0
 23108   200     0     0     0  9467   360     0     0     0  2715   126
     0]
AVERAGE ACCURACY: 0.5103321432435356
In [21]:
# Gaussian NB on the VADER scores computed from the summary sentences.
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) 
tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores from Summary -- Gaussian')
Vader Scores from Summary -- Gaussian
Accuracy: 0.5090987774532079
[  211   661   978   316    14   467  1603  4723  1099   116   459  1685
 17672  2919   798    49   404  4436  3246  1651     3    67   751  1091
   796]
Accuracy: 0.5043600562587904
[  225   581   983   291    23   517  1592  4780  1147    84   582  1605
 17309  3305   592    60   430  4428  3461  1362     4    70   754  1308
   722]
Accuracy: 0.5017202207075625
[  231   643   973   284    30   570  1555  4740  1100   112   579  1564
 17490  3010   816    55   374  4535  3146  1634     4    70   777  1158
   765]
Accuracy: 0.5042085902845397
[  209   627   985   283    24   467  1612  4801  1108   123   497  1713
 17466  3041   765    64   389  4529  3245  1559     2    70   763  1103
   770]
Accuracy: 0.5032565184463919
[  210   620   959   278    16   475  1637  4844  1085   115   481  1657
 17355  3023   792    68   386  4473  3252  1648     4    71   795  1167
   804]
AVERAGE ACCURACY: 0.5045288326300985
In [22]:
# Gaussian NB on the summary scores and the raw-text scores together (8 features).
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound','v_pos', 'v_neg', 'v_neu'])
tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores (original) and Vader Scores (summary) -- Gaussian')
Vader Scores (original) and Vader Scores (summary) -- Gaussian
Accuracy: 0.4752569512063183
[  640   391   850   214    85  1510   916  4295   767   520  1550   927
 17540  1307  2209   247   369  4122  1393  3655    27    74   676   456
  1475]
Accuracy: 0.4712971978794764
[  638   308   869   185   103  1601   874  4355   818   472  1611   917
 17214  1438  2213   279   377  4126  1500  3459    39    55   700   509
  1555]
Accuracy: 0.47209780374337335
[  674   345   855   180   107  1609   891  4338   731   508  1583   918
 17384  1295  2279   263   340  4228  1358  3555    43    65   690   465
  1511]
Accuracy: 0.47082116196040247
[  652   333   869   165   109  1491   961  4370   762   527  1575  1000
 17349  1224  2334   260   359  4232  1290  3645    34    70   686   411
  1507]
Accuracy: 0.4714703018500487
[  634   339   837   177    96  1534   958  4377   791   496  1559   931
 17214  1373  2231   278   350  4171  1453  3575    34    72   707   498
  1530]
AVERAGE ACCURACY: 0.4721886833279238
In [23]:
# Gaussian NB on the VADER scores of the frequent-word string ('v_freq_words').
small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd'])
tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores 50 most frequent filtered words -- Gaussian')
Vader Scores 50 most frequent filtered words -- Gaussian
Accuracy: 0.5117602510007573
[  213   711   952   243    61   411  1721  4525  1071   280   432  1889
 17660  2728   824    66   497  4304  3111  1808     5    77   729   951
   946]
Accuracy: 0.5124093908904035
[  221   632   969   225    56   454  1714  4513  1248   191   471  1817
 17407  3108   590    74   492  4309  3498  1368     8    75   747  1187
   841]
Accuracy: 0.5063940279130152
[  253   658   950   229    71   571  1639  4493  1083   291   529  1764
 17521  2769   876    74   471  4386  3062  1751    13    76   751  1006
   928]
Accuracy: 0.5075841177107
[  200   689   953   227    59   370  1761  4539  1136   305   437  1893
 17559  2684   909    71   477  4404  2959  1875     7    89   738   895
   979]
Accuracy: 0.5061560099534783
[  218   662   928   215    60   523  1646  4615  1095   277   509  1739
 17437  2775   848    81   464  4367  3113  1802     8    80   766  1009
   978]
AVERAGE ACCURACY: 0.5088607594936708

Bag of Words & Machine Learning

In [ ]:
# Bag of words over the plain tokens.
all_df['bow_v1'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
all_df
# One column per distinct token; the 'PoN' label is passed as the index.
new_df = pd.DataFrame(all_df['bow_v1'].tolist(), all_df['PoN'])
# Tokens absent from a row become 0 instead of NaN.
new_df = new_df.fillna(0).astype(int)
new_df[:5]
# The labels are read back from the index.
tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')
Starting point -- Gaussian
Accuracy: 0.18355512279562913
[1714  336   44   14   72 4525 2124  196  414  749 9992 3648  806 3145
 5942 2204  546  225 1886 4925  365   37   29  324 1953]
Accuracy: 0.1868873742291464
[1608  371   33   34   57 4519 2162  216  474  749 9978 3492  853 3324
 5746 2312  521  233 1960 4715  382   20   33  369 2054]
In [ ]:
# Re-run of the Gaussian baseline on whatever new_df currently holds.
tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')
In [ ]:
# Same token bag of words as the Gaussian baseline, rebuilt and fed to Multinomial NB.
all_df['bow_v1'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
new_df = pd.DataFrame(all_df['bow_v1'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Starting point -- Multinomial')
In [ ]:
# Collapse counts to 0/1 presence flags before fitting Bernoulli NB.
# Depends on new_df left behind by the previous cell.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Starting point -- Bernoulli')
In [ ]:
# Bag of words over the 'diy_cleaner' text, tokenized with casual_tokenize.
all_df['bow_v2'] = all_df.apply(lambda x: Counter(casual_tokenize(x['diy_cleaner'])), axis=1)
new_df = pd.DataFrame(all_df['bow_v2'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, GaussianNB(), 'DIY Cleaner -- Gaussian')
In [ ]:
# Same 'diy_cleaner' bag of words, rebuilt and fed to Multinomial NB.
all_df['bow_v2'] = all_df.apply(lambda x: Counter(casual_tokenize(x['diy_cleaner'])), axis=1)
new_df = pd.DataFrame(all_df['bow_v2'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'DIY Cleaner -- Multinomial')
In [ ]:
# Presence/absence version of the previous cell's new_df for Bernoulli NB.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'DIY Cleaner -- Bernoulli')
In [ ]:
# Bag of words over the 'pruned' text (words longer than three characters).
all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)
new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, GaussianNB(), 'Pruned Words -- Gaussian')
In [ ]:
# Same 'pruned' bag of words, rebuilt and fed to Multinomial NB.
all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)
new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Pruned Words -- Multinomial')
In [ ]:
# Presence/absence version of the previous cell's new_df for Bernoulli NB.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Pruned Words -- Bernoulli')
In [ ]:
# Bag of words over the negation-marked tokens: they are joined back into a
# string and re-tokenized with casual_tokenize before counting.
all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v4'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, GaussianNB(), 'NLTK negs -- Gaussian')
In [ ]:
# Multinomial NB on the negation-marked bag of words. The classifier now
# matches the printed title: this cell previously passed GaussianNB(), which
# merely repeated the Gaussian run under a "Multinomial" heading.
all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v4'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'NLTK negs -- Multinomial')
In [ ]:
# Presence/absence version of the previous cell's new_df for Bernoulli NB.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'NLTK negs -- Bernoulli')
In [ ]:
# Bag of words over the underscore-joined bigram features, re-tokenized
# with casual_tokenize after joining them into one string.
all_df['bow_v5'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['bigram_feats']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v5'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, GaussianNB(), 'Bigram Feats -- Gaussian')
In [ ]:
# Same bigram bag of words, rebuilt and fed to Multinomial NB.
all_df['bow_v5'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['bigram_feats']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v5'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Bigram Feats -- Multinomial')
In [ ]:
# Presence/absence version of the previous cell's new_df for Bernoulli NB.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Bigram Feats -- Bernoulli')
In [ ]:
# Bag of words over the tokens left after removing words shared by both classes.
# NOTE(review): the only line that creates the 'no_shared_words' column is
# commented out in the feature-building cell, so this lookup fails unless
# that line is restored.
all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, GaussianNB(), 'No Shared Words -- Gaussian')
In [ ]:
# Same "no shared words" bag of words, rebuilt and fed to Multinomial NB.
# NOTE(review): needs the 'no_shared_words' column, whose creating line is
# commented out in the feature-building cell.
all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
tables = get_NB(new_df, new_df.index, MultinomialNB(), 'No Shared Words -- Multinomial')
In [ ]:
# Presence/absence version of the previous cell's new_df for Bernoulli NB.
new_df = new_df.astype(bool).astype(int)
tables = get_NB(new_df, new_df.index, BernoulliNB(), 'No Shared Words -- Bernoulli')
In [ ]:
# Word cloud of the corpus-wide bag of words.
# NOTE(review): create_word_cloud_with_mask only exists as commented-out
# code in the VISUALS section; restore it before running this cell.
create_word_cloud_with_mask('yellow_square.png', big_bow, 750, "Top Words")
In [ ]:
# Word cloud of the negative-class bag of words.
# NOTE(review): neither create_word_cloud_with_mask (commented out) nor
# big_bow_n is defined in the visible cells.
create_word_cloud_with_mask('red_square.png', big_bow_n, 750, "Top Negative Words")
In [ ]:
# Word cloud of the positive-class bag of words.
# NOTE(review): neither create_word_cloud_with_mask (commented out) nor
# big_bow_p is defined in the visible cells.
create_word_cloud_with_mask('green_square.png', big_bow_p, 750, "Top Positive Words")
In [ ]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import BernoulliNB, MultinomialNB

def runPipeline(classifier, boolean, cv, X, y):
    """Cross-validate a CountVectorizer + classifier pipeline and print the mean score.

    Args:
        classifier: Estimator used as the final pipeline step.
        boolean: Passed to ``CountVectorizer(binary=...)``.
        cv: Passed to ``cross_val_score(cv=...)``.
        X: Raw text documents.
        y: Labels aligned with ``X``.

    Prints one line with the mean fold score, the first letter of
    ``boolean``, ``cv`` and the classifier's class name. Returns nothing.
    """
    vectorizer = CountVectorizer(encoding='latin-1', binary=boolean)
    steps = [('vect', vectorizer), ('nb', classifier)]
    fold_scores = cross_val_score(Pipeline(steps), X, y, cv=cv)
    mean_score = sum(fold_scores) / len(fold_scores)
    binary_flag = str(boolean)[0]
    classifier_name = str(classifier).split('(')[0]
    print("{} | B? {} | CV: {} | Classifier: {}".format(mean_score, binary_flag, cv, classifier_name))

# X = array of data
# y = array of labels

# Keep only the raw text (column 0) and the label.
hw6 = all_df[[0,'PoN']]
X = hw6[0].tolist()
y = hw6['PoN'].tolist()

# Arguments: classifier, binary-count flag for the vectorizer, number of CV folds.
runPipeline(BernoulliNB(), False, 5, X=X, y=y)
runPipeline(BernoulliNB(), False, 3, X=X, y=y)
runPipeline(MultinomialNB(), False, 5, X=X, y=y)
runPipeline(MultinomialNB(), False, 3, X=X, y=y)
runPipeline(MultinomialNB(), True, 5,  X=X, y=y)
runPipeline(MultinomialNB(), True, 3,  X=X, y=y)
In [ ]:
from tabulate import tabulate

# Alias for the text/label frame built in the pipeline cell (same object, not a copy).
df = hw6
def shorten(long_string):
    """Truncate a string to at most 20 characters.

    Strings that are already 20 characters or shorter are returned whole.
    Previously anything under 21 characters was cut down to its first
    character (``[:1]``), which emptied short cells in the printed table.

    Args:
        long_string: Text to shorten.

    Returns:
        The first 20 characters of ``long_string``.
    """
    return long_string[:20]

def df_for_tabulate(df, column):
    """Return a copy of ``df`` with ``column`` passed through ``shorten``.

    Args:
        df: Source DataFrame; it is not modified.
        column: Label of the text column to shorten.

    Returns:
        A new DataFrame whose ``column`` values are the shortened strings.
    """
    def _short_cell(row):
        return shorten(row[column])

    pretty_df = df.copy()
    pretty_df[column] = pretty_df.apply(_short_cell, axis = 1)
    return pretty_df
    
# Shorten the raw-text column (label 0) and print the first ten rows as a plain table.
tabulate_df = df_for_tabulate(df, 0)
print(tabulate(tabulate_df[:10], tablefmt="simple", headers=tabulate_df.columns))
In [ ]:
 
In [ ]: