HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

##########################################
# NOTE: I'm toying with the idea of requiring the library just above 
# when I use it so it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()

Import data from files¶

import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

neg = get_data_from_files('../NEG_JK/')
pos = get_data_from_files('../POS_JK/')

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

STEP 2b: Label it¶

pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'

STEP 2c: Combine the dfs¶

all_df = neg_df.append(pos_df)

all_df

STEP 3: TOKENIZE (and clean)!!¶

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Came back and added sentences for tokinization for "Summary experiment"
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)

all_df

STEP 4: Remove Stopwords¶

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)

all_df

STEP 5: Create a Frequency Distribution¶

from nltk.probability import FreqDist
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)

def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)

def get_fdist(tokens):
    return (FreqDist(tokens))
    
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)
all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)

all_df

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def get_vader_score(review):
    return sid.polarity_scores(review)

all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)

def separate_vader_score(vader_score, key):
    return vader_score[key]

all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)
all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)
all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)
all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)

DIY SUMMARY¶

def get_weighted_freq_dist(review, freq_dist):
    max_freq = max(freq_dist.values())
    for word in freq_dist.keys():
        freq_dist[word] = (freq_dist[word]/max_freq)
    return freq_dist

all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)

def get_sentence_score(review, freq_dist):
    sentence_scores = {}
    for sent in review:
        for word in nltk.word_tokenize(sent.lower()):
            if word in freq_dist.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = freq_dist[word]
                    else:
                        sentence_scores[sent] += freq_dist[word]
    return sentence_scores

all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)

def get_summary_sentences(sentence_scores):
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    return ''.join(sent[0] for sent in sorted_sentences[:5])

all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)

summaries = all_df['summary_sentences'].tolist()

summaries[3]

'Todd Philips should maybe read some comics and don;t copy movies like taxi driver or similar.Bt this does still not make a great movie. What idiotic FIlm\nI can say that Phoenix is master actor.'

Doing VADER on the Summary Section¶

all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)

all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)
all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)
all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)
all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)

Doing VADER on the Most Frequent Words¶

def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)

all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)
all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)
all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)
all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)
all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)

STEP 7: Test `Step 6` with Machine Learning!!¶

Naive Bayes¶

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def get_NB(small_df, labels):
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    from sklearn import metrics
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

TEST 1: Vader Scores (Original)¶

small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'])

Accuracy: 0.6621621621621622

TEST 2: Vader Scores (from Summary)¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7027027027027027

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'])

Accuracy: 0.6891891891891891

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7297297297297297

TEST 5: All `compound` Vader Scores¶

small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7027027027027027

TEST 6: ALL THE NUMBERS!!¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7027027027027027

TEST 7: Test UNFILTERED most frequent words¶

def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)

all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)

all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)
all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)
all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)
all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7297297297297297

small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603
get_NB(small_df, all_df['PoN'])

Accuracy: 0.7162162162162162

summaries_pos = all_df[all_df['PoN'] == 'P']
summaries_neg = all_df[all_df['PoN'] == 'N']

summaries_pos_list = summaries_pos['summary_sentences'].tolist()
summaries_neg_list = summaries_neg['summary_sentences'].tolist()

summaries_pos_list[:1]

['Just don\'t expect any "real" Batman references.Director Phillips delivers a film that looks and feels and sounds much different than other comic book movies.Cinematographer Lawrence Sher is a frequent Phillips collaborator (all 3 Hangover movies) and the dark look and gritty feel are present in most every shot.We are informed Arthur suffers from Pseudobulbar Affect, also known as emotional incontinence, which causes that creepy laughter to pop up at some inappropriate times.The "Smile" song is especially relevant as its origins can be traced by to Charlie Chaplin\'s MODERN TIMES, a silent movie classic featured in this film.This is not one for the younger kids, no matter how much they enjoy THE AVENGERS or WONDER WOMAN (or any other DC or Marvel film).']

summaries_neg_list[:1]

['']

summaries_neg_list[:1]

['']

### VERSION 1
#     all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
#     unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
#     sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#     training_set = sentim_analyzer.apply_features(training_docs)
#     test_set = sentim_analyzer.apply_features(testing_docs)
sentim_analyzer = SentimentAnalyzer()

def get_nltk_negs(tokens):
    all_words_neg = sentim_analyzer.all_words([mark_negation(tokens)])
    return all_words_neg

def get_unigram_feats(neg_tokens):
    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)
    return unigram_feats
    
def get_bigram_feats(tokens):
    ngrams = zip(*[tokens[i:] for i in range(2)])
    return ["_".join(ngram) for ngram in ngrams]

all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)
all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
all_df['bigram_feats'] = all_df.apply(lambda x: get_bigram_feats(x['tokens']), axis=1)
all_df['bigram_feats_neg'] = all_df.apply(lambda x: get_bigram_feats(x['nltk_negs']), axis=1)
all_df[:5]
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)

### VERSION 2
#     all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
#     unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
#     sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#     training_set = sentim_analyzer.apply_features(training_docs)
#     test_set = sentim_analyzer.apply_features(testing_docs)
sentim_analyzer = SentimentAnalyzer()

def get_nltk_data(tokens):
    neg_tokens = sentim_analyzer.all_words([mark_negation(tokens)])
    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    return sentim_analyzer.apply_features(tokens)


# def get_unigram_feats(neg_tokens):
    
#     return unigram_feats
nltk_df = pd.DataFrame()
nltk_df['nltk_data'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)

# all_df['nltk']
# all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)

all_df['nltk_all'] = 0

all_df['nltk_all']

0      0
1      0
2      0
3      0
4      0
      ..
118    0
119    0
120    0
121    0
122    0
Name: nltk_all, Length: 246, dtype: int64

all_df[:3]

STEP 8: Add Bag of Words to Machine Learning models¶

from nltk.tokenize import casual_tokenize
from collections import Counter
all_df['bow_nosw'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)

all_df[:3]

# Keeping punctuation
# def diy_cleaner(review):
#     both = review.split('\n')
#     title = both[0]
#     review = both[1]
#     cleaned = title + '.' + title + '.' + review
#     return cleaned.lower()

# Removing punctuation

# def diy_cleaner(review):
#     both = review.split('\n')
#     title = both[0]
#     review = both[1]
#     review = review.replace("\'",'')
#     review = review.replace("'",'')
#     review = review.replace(",",'')
#     cleaned = title + ' ' + title + ' ' + ' '.join(review.split('.'))
#     return cleaned.lower()

import re, string

def diy_cleaner(review):
    both = review.split('\n')
    title = both[0]
    review = both[1]
    review = review.replace("'","")
    pattern = re.compile('[\W_]+')
    review = pattern.sub(' ', review)
    cleaned = title + ' ' + title + ' ' + review
    return cleaned.lower()

all_df['diy_cleaner'] = all_df.apply(lambda x: diy_cleaner(x[0]), axis=1)

all_df['diy_cleaner'][0].tolist()

[' missed opportunity  missed opportunity i had been very excited to see this movie ever since i had 1st heard about it and was anticipating its release but when it started getting the type of hype and press that it was i became a bit apprehensive as the things that were being said about it which seemed outrageous mostly were things that were said about other moves i had been looking forward to seeing and that had ended letting me down the joker sadly turned out to be one of those movies for me now as i know it has been said many times before phoenixs portrayal of man with mental health depression was very very good convincing but the story and the plot seemed like a big missed opportunity to deliver an anti hero origin story of one of the most coveted villains in comic book lore i liked the quasi mirroring of a failed system whose middle lower class revolt against the rich in a sort of if we burn you burn with us type of way that has been done many times before im sure people can see glimpses of that today in our us culture and who knows maybe the studio execs got their fingers involved and are the ones that watered it down a bit too much maybe there will be a directors cut that is darker than the theatrical release i certainly hope so because for all the reviews i have read that speak of people walking out of theaters because of the dark feel and psychological overtones being too much for them to stomach all i felt was a sense of wanting it to be bigger and grander than what i was seeing i would recommend seeing this movie so that you can formulate your own opinion but i gave it a low rating because when it was over i walked out feeling like i just saw a movie ive already seen many times before ',
 ' funny like a clown  funny like a clown greetings again from the darkness the first thing to know is that this is not a superhero movie in fact there are no heroes in the movie unless you would like to apply the label to a single mom who lives down the hall from arthur fleck mr fleck lives at home with his invalid mother in a grungy run down apartment he works as a clown for hire dreams of becoming a stand up comedian and depends on social services to supply the 7 medications he takes since being released from arkham state hospital its a bleak existence at a bleak time in a bleak city gotham is in the midst of a garbage workers strike only the super rats are happy political upheaval and a growing chasm between the classes and then it gets worse for arthur the second thing to know is that this is a standalone joker film and one mostly unrelated or not connected to previous projects featuring the colorful clown prince character played and voiced by such memorable actors as cesar romero jack nicholson heath ledger mark hamill jared leto and even zach galifianakis director todd phillips who co wrote the script with scott silver is best known for such extreme comedies as the hangover franchise and old school so hes a bit outside of his usual wheelhouse phillips and silver seem to embrace not just the history of the character but also the look texture and tone of filmmaking from an earlier era the gritty and outcast feel of scorceses taxi driver and the king of comedy is present and so are numerous tributes to familiar joker moments of days gone by three time oscar nominee joaquin phoenix plays arthur fleck and he delivers arthurs slow descent into madness or shall we say further descent its clear from the beginning that arthur views himself as ignored by society while all he wants to do is bring joy and laughter to others and be noticed his daydreams or visions of himself in a better world send a strong message phoenix shows us what full commitment to a role looks like he lost 50 pounds leaving a frame that contorts moves and dances in a manner unlike what weve seen before in fact its a toss up on which shows up more frequently his dances moves or his maniacal pained laughter we are informed arthur suffers from pseudobulbar affect also known as emotional incontinence which causes that creepy laughter to pop up at some inappropriate times of course the comparisons to heath ledgers oscar winning turn in the dark knight are inevitable the roles and films are written quite differently and its safe to say both actors were all in action sequences and special visual effects are both noticeably absent but the violence is sure to shock this is not one for the younger kids no matter how much they enjoy the avengers or wonder woman or any other dc or marvel film this gritty visceral approach is often a tough watch and is much more a character study of mental illness than a costume drama although arthurs clothes and make up are front and center when arthur states i have nothing but bad thoughts we believe him and the sympathetic back story explains a great deal and will likely prove quite controversial phoenix dominates the film as he should and supporting work is provided by robert de niro as murray franklin a tv talk show host in the johnny carson mode zazie beetz deadpool 2 as the single mom neighbor sophie dumond frances conroy as penny fleck arthurs mother brett cullen as a not so empathetic thomas wayne and shea whigham and bill camp as police detectives ill hesitantly mention that dante pereira olson makes a couple of brief appearances as an adolescent bruce wayne and just for fun we get a shot of the young man honing the batpole skills he will use later in life just dont expect any real batman references director phillips delivers a film that looks and feels and sounds much different than other comic book movies cinematographer lawrence sher is a frequent phillips collaborator all 3 hangover movies and the dark look and gritty feel are present in most every shot hildur guonadottir this years emmy winner for chernobyl serves up a foreboding score one that never overwhelms and one that contrasts perfectly with the more traditional songs utilized throughout stephen sondheims send in the clowns jimmy durante singing smile creams white room thats life by frank sinatra and gary glitters familiar rock and roll part 1 and 2 the smile song is especially relevant as its origins can be traced by to charlie chaplins modern times a silent movie classic featured in this film phillips even uses the saul bass designed warner bros logo to open the credits making sure we understand the time period no cell phones etc the film traces arthurs slide into crime a transition that he wasnt seeking and one that he believes was forced upon him his rise as a savior to the working class is secondary to his own journey and the chaos is handled on the perimeters of the film preventing this from becoming a super villain movie keep in mind joker played at venice telluride and toronto three prestigious festivals this is just another thing that sets it apart from others in the genre despite the 1981 time stamp the consistent anti rich message and class disparity is prevalent throughout this appears to be phillips way of including a contemporary theme in a decades old setting and its a cautionary tale that there should be no clown left behind ']

Create BOW using (1) `casual_tokenize` and (2) `Counter`¶

all_df['bow_v2'] = all_df.apply(lambda x: Counter(casual_tokenize(x['diy_cleaner'])), axis=1)

all_df[:3]

Get a Bag of Words from a column (for wordclouds etc!)¶

def get_bow_from_column(df, column):
    all_column_data = ' '.join(df[column].tolist())
    all_column_fd = Counter(all_column_data.split())
    return all_column_fd

Get BOW for all, BOW for positive, BOW for negative¶

big_bow = get_bow_from_column(all_df, 'diy_cleaner')
big_bow_n = get_bow_from_column(all_df[all_df['PoN'] == 'N'], 'diy_cleaner')
big_bow_p = get_bow_from_column(all_df[all_df['PoN'] == 'P'], 'diy_cleaner')

big_bow_n.most_common(10)

[('the', 902),
 ('a', 463),
 ('and', 422),
 ('to', 395),
 ('of', 375),
 ('is', 342),
 ('i', 274),
 ('it', 267),
 ('movie', 249),
 ('this', 247)]

big_bow_p.most_common(10)

[('the', 1144),
 ('a', 646),
 ('and', 582),
 ('of', 521),
 ('to', 473),
 ('is', 449),
 ('it', 320),
 ('that', 277),
 ('in', 271),
 ('this', 270)]

# Wow this is unhelpful. Removing words < 3 characters like Professor Gates does!

def pruner(review):
    clean_review = ' '.join([word for word in review.split() if len(word) > 3])
    return clean_review

all_df['pruned'] = all_df.apply(lambda x: pruner(x['diy_cleaner']), axis=1)

big_bow = get_bow_from_column(all_df, 'pruned')
big_bow_n = get_bow_from_column(all_df[all_df['PoN'] == 'N'], 'pruned')
big_bow_p = get_bow_from_column(all_df[all_df['PoN'] == 'P'], 'pruned')

big_bow.most_common(10)

[('this', 522),
 ('movie', 508),
 ('that', 493),
 ('joker', 440),
 ('film', 283),
 ('with', 249),
 ('from', 176),
 ('just', 169),
 ('phoenix', 159),
 ('character', 153)]

big_bow_n.most_common(10)

[('movie', 256),
 ('this', 250),
 ('that', 213),
 ('joker', 197),
 ('with', 103),
 ('just', 97),
 ('film', 79),
 ('from', 77),
 ('about', 75),
 ('character', 66)]

big_bow_p.most_common(10)

[('that', 280),
 ('this', 272),
 ('movie', 252),
 ('joker', 243),
 ('film', 204),
 ('with', 146),
 ('phoenix', 108),
 ('from', 99),
 ('character', 87),
 ('joaquin', 80)]

# bow_df = all_df['pruned']
# get_NB(small_df, all_df['PoN'])
all_df['pruned']

0      missed opportunity missed opportunity been ver...
1      phoenix's acting.. phoenix's acting.. dont thi...
2      everyone praised overrated movie. everyone pra...
3      what idiotic film what idiotic film that phoen...
4      terrible terrible only thing good about this m...
                             ...                        
118    nerve-wracking, very uncomfortable nerve-wrack...
119    solid film there glaring problems solid film t...
120    joker endgame joker endgame need more everythi...
121    absolutely absolutely strong fanboy hype rush ...
122    overhyped, it's alright overhyped, it's alrigh...
Name: pruned, Length: 246, dtype: object

all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)
new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.6486486486486487

new_df = pd.DataFrame(all_df['bow_v2'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.6621621621621622

new_df = pd.DataFrame(all_df['bow_nosw'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.6756756756756757

all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v4'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.6351351351351351

all_df['bow_v5'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['bigram_feats']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v5'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.581081081081081

all_bigrams = []
for review in all_df['bigram_feats']:
    for bigram in review:
        all_bigrams.append(bigram)

bigram_count = Counter(all_bigrams)

bigram_count.most_common(10)

[('of_the', 286),
 ('the_joker', 217),
 ('this_movie', 162),
 ('in_the', 157),
 ('the_movie', 154),
 ('the_film', 129),
 ('is_a', 124),
 ('to_be', 116),
 ('to_the', 113),
 ('joker_is', 98)]

Returning to Most Frequent Words¶

What can we learn from the intersection of "positive" and "negative" words? Essentially creating a new "stopword" list of "words that frequently occur in both lists

big_bow = get_bow_from_column(all_df, 'pruned')
big_bow_n = get_bow_from_column(all_df[all_df['PoN'] == 'N'], 'pruned')
big_bow_p = get_bow_from_column(all_df[all_df['PoN'] == 'P'], 'pruned')

most_common_neg = [word[0] for word in big_bow_n.most_common(100)]
most_common_neg[:10]

['movie',
 'this',
 'that',
 'joker',
 'with',
 'just',
 'film',
 'from',
 'about',
 'character']

most_common_pos = [word[0] for word in big_bow_p.most_common(100)]
most_common_pos[:10]

['that',
 'this',
 'movie',
 'joker',
 'film',
 'with',
 'phoenix',
 'from',
 'character',
 'joaquin']

import numpy as np
print("Unique values in array1 that are not in array2:")
neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)
neg_notpos

Unique values in array1 that are not in array2:

array(['after', 'before', 'boring', 'characters', 'didnt', 'different',
       'down', 'everyone', 'felt', 'give', 'hype', 'interesting', 'look',
       'love', 'never', 'over', 'overrated', 'part', 'phoenixs',
       'reviews', 'same', 'their', 'them', 'then', 'things', 'want',
       'where', 'without'], dtype='<U11')

print("Unique values in array2 that are not in array1:")
pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)
pos_notneg

Unique values in array2 that are not in array1:

array(['actor', 'arthur', 'best', 'book', 'cinema', 'comic', 'could',
       'director', 'ever', 'films', 'fleck', 'going', 'here', 'life',
       'makes', 'oscar', 'perfect', 'phillips', 'score', 'slow', 'take',
       'thing', 'those', 'todd', 'violence', 'while', 'world', 'your'],
      dtype='<U11')

print("Common values between two arrays:")
in_both = np.intersect1d(most_common_neg, most_common_pos)
print(len(in_both))
in_both

Common values between two arrays:
72

array(['about', 'acting', 'also', 'batman', 'because', 'been', 'being',
       'better', 'character', 'dark', 'does', 'doesnt', 'dont', 'driver',
       'even', 'feel', 'film', 'first', 'from', 'good', 'great', 'have',
       'heath', 'into', 'joaquin', 'joker', 'just', 'know', 'like',
       'made', 'make', 'many', 'masterpiece', 'mental', 'more', 'most',
       'movie', 'movies', 'much', 'nothing', 'only', 'origin', 'other',
       'people', 'performance', 'phoenix', 'plot', 'really', 'seen',
       'society', 'some', 'story', 'taxi', 'than', 'that', 'thats',
       'there', 'they', 'think', 'this', 'time', 'times', 'very', 'watch',
       'well', 'were', 'what', 'when', 'which', 'will', 'with', 'would'],
      dtype='<U11')

def get_common_words(num):
    most_common_neg = [word[0] for word in big_bow_n.most_common(num)]
    most_common_pos = [word[0] for word in big_bow_p.most_common(num)]
    in_both = np.intersect1d(most_common_neg, most_common_pos)
    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)
    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)
    return [len(in_both), len(neg_notpos), len(pos_notneg), len(in_both)/num, in_both, neg_notpos, pos_notneg]

common_100 = get_common_words(100)
common_200 = get_common_words(200)
common_300 = get_common_words(300)
common_500 = get_common_words(500)
common_1000 = get_common_words(1000)

common_100[:4]

[72, 28, 28, 0.72]

common_200[:4]

[141, 59, 59, 0.705]

common_300[:4]

[204, 96, 96, 0.68]

common_500[:4]

[308, 192, 192, 0.616]

common_1000[:4]

[565, 435, 435, 0.565]

Only tokenizing words that aren't in the 1000 most common¶

def get_only_polarized(tokens):
#     return [token for token in tokens if token not in common_1000[4]] # 66
#     return [token for token in tokens if token not in common_100[4]] # 70
    return [token for token in tokens if token not in common_500[4]] # 70

all_df['no_shared_words'] = all_df.apply(lambda x: get_only_polarized(x['tokens']), axis=1)

all_df[:5]

all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.7027027027027027

def get_only_polarized_v2(tokens):
#     return [token for token in tokens if token in common_1000[5]] # 54
#     return [token for token in tokens if token not in common_1000[5]] # 59
#     return [token for token in tokens if token not in common_1000[6]] # 60
    return [token for token in tokens if token not in common_1000[6]] # 60

all_df['no_neg_words'] = all_df.apply(lambda x: get_only_polarized_v2(x['tokens']), axis=1)

all_df['bow_v7'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_neg_words']))), axis=1)
new_df = pd.DataFrame(all_df['bow_v7'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]

get_NB(new_df, new_df.index)

Accuracy: 0.6081081081081081

	0	PoN
0	Missed Opportunity\nI had been very excited t...	N
1	5/5 for Phoenix's acting..\nI don't think the...	N
2	Everyone praised an overrated movie.\nOverrat...	N
3	What idiotic FIlm\nI can say that Phoenix is ...	N
4	Terrible\nThe only thing good about this movi...	N
...	...	...
118	Nerve-wracking, but in very uncomfortable way...	P
119	Solid film but there are glaring problems\nOk...	P
120	Joker > Endgame\nNeed I say more? Everything ...	P
121	Absolutely not a 10\nStrong fanboy and hype r...	P
122	Overhyped, but it's alright\nIt's a good film...	P

	0	PoN	sentences	num_sentences	tokens	num_tokens
0	Missed Opportunity\nI had been very excited t...	N	[ Missed Opportunity\nI had been very excited ...	1	[missed, opportunity, i, had, been, very, exci...	306
1	5/5 for Phoenix's acting..\nI don't think the...	N	[ 5/5 for Phoenix's acting.., I don't think th...	5	[for, phoenix, i, do, think, there, was, a, ne...	59
2	Everyone praised an overrated movie.\nOverrat...	N	[ Everyone praised an overrated movie., Overra...	2	[everyone, praised, an, overrated, movie, over...	26
3	What idiotic FIlm\nI can say that Phoenix is ...	N	[ What idiotic FIlm\nI can say that Phoenix is...	4	[what, idiotic, film, i, can, say, that, phoen...	66
4	Terrible\nThe only thing good about this movi...	N	[ Terrible\nThe only thing good about this mov...	9	[terrible, the, only, thing, good, about, this...	124
...	...	...	...	...	...	...
118	Nerve-wracking, but in very uncomfortable way...	P	[ Nerve-wracking, but in very uncomfortable wa...	8	[but, in, very, uncomfortable, way, why, every...	57
119	Solid film but there are glaring problems\nOk...	P	[ Solid film but there are glaring problems\nO...	13	[solid, film, but, there, are, glaring, proble...	628
120	Joker > Endgame\nNeed I say more? Everything ...	P	[ Joker > Endgame\nNeed I say more?, Everythin...	5	[joker, endgame, need, i, say, more, everythin...	83
121	Absolutely not a 10\nStrong fanboy and hype r...	P	[ Absolutely not a 10\nStrong fanboy and hype ...	5	[absolutely, not, a, strong, fanboy, and, hype...	81
122	Overhyped, but it's alright\nIt's a good film...	P	[ Overhyped, but it's alright\nIt's a good fil...	3	[overhyped, but, it, alright, it, a, good, fil...	60

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw
0	Missed Opportunity\nI had been very excited t...	N	[ Missed Opportunity\nI had been very excited ...	1	[missed, opportunity, i, had, been, very, exci...	306	[missed, opportunity, excited, see, movie, eve...	140
1	5/5 for Phoenix's acting..\nI don't think the...	N	[ 5/5 for Phoenix's acting.., I don't think th...	5	[for, phoenix, i, do, think, there, was, a, ne...	59	[phoenix, think, need, super, dark, film, tbh,...	25
2	Everyone praised an overrated movie.\nOverrat...	N	[ Everyone praised an overrated movie., Overra...	2	[everyone, praised, an, overrated, movie, over...	26	[everyone, praised, overrated, movie, overrate...	13
3	What idiotic FIlm\nI can say that Phoenix is ...	N	[ What idiotic FIlm\nI can say that Phoenix is...	4	[what, idiotic, film, i, can, say, that, phoen...	66	[idiotic, film, say, phoenix, master, actor, b...	36
4	Terrible\nThe only thing good about this movi...	N	[ Terrible\nThe only thing good about this mov...	9	[terrible, the, only, thing, good, about, this...	124	[terrible, thing, good, movie, phoenixs, actin...	65
...	...	...	...	...	...	...	...	...
118	Nerve-wracking, but in very uncomfortable way...	P	[ Nerve-wracking, but in very uncomfortable wa...	8	[but, in, very, uncomfortable, way, why, every...	57	[uncomfortable, way, everybody, keep, saying, ...	33
119	Solid film but there are glaring problems\nOk...	P	[ Solid film but there are glaring problems\nO...	13	[solid, film, but, there, are, glaring, proble...	628	[solid, film, glaring, problems, okay, first, ...	292
120	Joker > Endgame\nNeed I say more? Everything ...	P	[ Joker > Endgame\nNeed I say more?, Everythin...	5	[joker, endgame, need, i, say, more, everythin...	83	[joker, endgame, need, say, everything, movie,...	53
121	Absolutely not a 10\nStrong fanboy and hype r...	P	[ Absolutely not a 10\nStrong fanboy and hype ...	5	[absolutely, not, a, strong, fanboy, and, hype...	81	[absolutely, strong, fanboy, hype, rush, going...	36
122	Overhyped, but it's alright\nIt's a good film...	P	[ Overhyped, but it's alright\nIt's a good fil...	3	[overhyped, but, it, alright, it, a, good, fil...	60	[overhyped, alright, good, film, see, like, ma...	31

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw	topwords_unfil	topwords_fil	freq_dist	freq_dist_unfil
0	Missed Opportunity\nI had been very excited t...	N	[ Missed Opportunity\nI had been very excited ...	1	[missed, opportunity, i, had, been, very, exci...	306	[missed, opportunity, excited, see, movie, eve...	140	[(of, 13), (i, 12), (the, 12), (that, 10), (it...	[(movie, 3), (said, 3), (many, 3), (times, 3),...	{'missed': 2, 'opportunity': 2, 'excited': 1, ...	{'missed': 2, 'opportunity': 2, 'i': 12, 'had'...
1	5/5 for Phoenix's acting..\nI don't think the...	N	[ 5/5 for Phoenix's acting.., I don't think th...	5	[for, phoenix, i, do, think, there, was, a, ne...	59	[phoenix, think, need, super, dark, film, tbh,...	25	[(was, 4), (a, 3), (that, 3), (for, 2), (there...	[(dark, 2), (phoenix, 1), (think, 1), (need, 1...	{'phoenix': 1, 'think': 1, 'need': 1, 'super':...	{'for': 2, 'phoenix': 1, 'i': 1, 'do': 1, 'thi...
2	Everyone praised an overrated movie.\nOverrat...	N	[ Everyone praised an overrated movie., Overra...	2	[everyone, praised, an, overrated, movie, over...	26	[everyone, praised, overrated, movie, overrate...	13	[(overrated, 2), (movie, 2), (everyone, 1), (p...	[(overrated, 2), (movie, 2), (everyone, 1), (p...	{'everyone': 1, 'praised': 1, 'overrated': 2, ...	{'everyone': 1, 'praised': 1, 'an': 1, 'overra...
3	What idiotic FIlm\nI can say that Phoenix is ...	N	[ What idiotic FIlm\nI can say that Phoenix is...	4	[what, idiotic, film, i, can, say, that, phoen...	66	[idiotic, film, say, phoenix, master, actor, b...	36	[(and, 4), (is, 2), (make, 2), (movie, 2), (to...	[(make, 2), (movie, 2), (idiotic, 1), (film, 1...	{'idiotic': 1, 'film': 1, 'say': 1, 'phoenix':...	{'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...
4	Terrible\nThe only thing good about this movi...	N	[ Terrible\nThe only thing good about this mov...	9	[terrible, the, only, thing, good, about, this...	124	[terrible, thing, good, movie, phoenixs, actin...	65	[(the, 5), (this, 4), (movie, 4), (it, 4), (to...	[(movie, 4), (terrible, 3), (acting, 3), (good...	{'terrible': 3, 'thing': 1, 'good': 2, 'movie'...	{'terrible': 3, 'the': 5, 'only': 1, 'thing': ...
...	...	...	...	...	...	...	...	...	...	...	...	...
118	Nerve-wracking, but in very uncomfortable way...	P	[ Nerve-wracking, but in very uncomfortable wa...	8	[but, in, very, uncomfortable, way, why, every...	57	[uncomfortable, way, everybody, keep, saying, ...	33	[(it, 4), (a, 4), (movie, 3), (in, 2), (keep, ...	[(movie, 3), (keep, 2), (saying, 2), (psycho, ...	{'uncomfortable': 1, 'way': 1, 'everybody': 1,...	{'but': 1, 'in': 2, 'very': 1, 'uncomfortable'...
119	Solid film but there are glaring problems\nOk...	P	[ Solid film but there are glaring problems\nO...	13	[solid, film, but, there, are, glaring, proble...	628	[solid, film, glaring, problems, okay, first, ...	292	[(the, 35), (to, 22), (it, 16), (and, 16), (i,...	[(joker, 6), (movie, 5), (film, 4), (like, 4),...	{'solid': 1, 'film': 4, 'glaring': 1, 'problem...	{'solid': 1, 'film': 4, 'but': 5, 'there': 3, ...
120	Joker > Endgame\nNeed I say more? Everything ...	P	[ Joker > Endgame\nNeed I say more?, Everythin...	5	[joker, endgame, need, i, say, more, everythin...	83	[joker, endgame, need, say, everything, movie,...	53	[(joker, 3), (movie, 3), (in, 3), (it, 3), (th...	[(joker, 3), (movie, 3), (masterful, 2), (awes...	{'joker': 3, 'endgame': 1, 'need': 1, 'say': 1...	{'joker': 3, 'endgame': 1, 'need': 1, 'i': 1, ...
121	Absolutely not a 10\nStrong fanboy and hype r...	P	[ Absolutely not a 10\nStrong fanboy and hype ...	5	[absolutely, not, a, strong, fanboy, and, hype...	81	[absolutely, strong, fanboy, hype, rush, going...	36	[(the, 7), (is, 6), (a, 4), (fanboy, 2), (and,...	[(fanboy, 2), (movie, 2), (absolutely, 1), (st...	{'absolutely': 1, 'strong': 1, 'fanboy': 2, 'h...	{'absolutely': 1, 'not': 1, 'a': 4, 'strong': ...
122	Overhyped, but it's alright\nIt's a good film...	P	[ Overhyped, but it's alright\nIt's a good fil...	3	[overhyped, but, it, alright, it, a, good, fil...	60	[overhyped, alright, good, film, see, like, ma...	31	[(it, 4), (but, 3), (a, 3), (good, 2), (do, 2)...	[(good, 2), (overhyped, 1), (alright, 1), (fil...	{'overhyped': 1, 'alright': 1, 'good': 2, 'fil...	{'overhyped': 1, 'but': 3, 'it': 4, 'alright':...

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw	topwords_unfil	topwords_fil	...	unigram_feats	bow_nosw	bow_v2	diy_cleaner	pruned	bow_v3	bow_v4	bigram_feats	bigram_feats_neg
0	Missed Opportunity\nI had been very excited t...	N	[ Missed Opportunity\nI had been very excited ...	1	[missed, opportunity, i, had, been, very, exci...	306	[missed, opportunity, excited, see, movie, eve...	140	[(of, 13), (i, 12), (the, 12), (that, 10), (it...	[(movie, 3), (said, 3), (many, 3), (times, 3),...	...	[of, i, the, that, it, a, and, to, was, had, b...	{'Missed': 1, 'Opportunity': 1, 'I': 14, 'had'...	{'missed': 3, 'opportunity': 3, 'i': 14, 'had'...	missed opportunity missed opportunity i had ...	missed opportunity missed opportunity been ver...	{'missed': 3, 'opportunity': 3, 'been': 4, 've...	{'missed': 2, 'opportunity': 2, 'i': 12, 'had'...	[missed_opportunity, opportunity_i, i_had, had...	[missed_opportunity, opportunity_i, i_had, had...
1	5/5 for Phoenix's acting..\nI don't think the...	N	[ 5/5 for Phoenix's acting.., I don't think th...	5	[for, phoenix, i, do, think, there, was, a, ne...	59	[phoenix, think, need, super, dark, film, tbh,...	25	[(was, 4), (a, 3), (that, 3), (for, 2), (there...	[(dark, 2), (phoenix, 1), (think, 1), (need, 1...	...	[a, for, was, dark, was_NEG, that_NEG, phoenix...	{'5/5': 1, 'for': 2, 'Phoenix's': 1, 'acting':...	{'5/5': 2, 'for': 3, 'phoenix's': 2, 'acting':...	5/5 for phoenix's acting.. 5/5 for phoenix's...	phoenix's acting.. phoenix's acting.. dont thi...	{'phoenix's': 2, 'acting': 2, '..': 2, 'dont':...	{'for': 2, 'phoenix': 1, 'i': 1, 'do': 1, 'thi...	[for_phoenix, phoenix_i, i_do, do_think, think...	[for_phoenix, phoenix_i, i_do, do_think, think...
2	Everyone praised an overrated movie.\nOverrat...	N	[ Everyone praised an overrated movie., Overra...	2	[everyone, praised, an, overrated, movie, over...	26	[everyone, praised, overrated, movie, overrate...	13	[(overrated, 2), (movie, 2), (everyone, 1), (p...	[(overrated, 2), (movie, 2), (everyone, 1), (p...	...	[overrated, movie, everyone, praised, an, of, ...	{'Everyone': 1, 'praised': 1, 'an': 1, 'overra...	{'everyone': 2, 'praised': 2, 'an': 2, 'overra...	everyone praised an overrated movie. everyon...	everyone praised overrated movie. everyone pra...	{'everyone': 2, 'praised': 2, 'overrated': 3, ...	{'everyone': 1, 'praised': 1, 'an': 1, 'overra...	[everyone_praised, praised_an, an_overrated, o...	[everyone_praised, praised_an, an_overrated, o...
3	What idiotic FIlm\nI can say that Phoenix is ...	N	[ What idiotic FIlm\nI can say that Phoenix is...	4	[what, idiotic, film, i, can, say, that, phoen...	66	[idiotic, film, say, phoenix, master, actor, b...	36	[(and, 4), (is, 2), (make, 2), (movie, 2), (to...	[(make, 2), (movie, 2), (idiotic, 1), (film, 1...	...	[and_NEG, make_NEG, movie_NEG, to_NEG, with_NE...	{'What': 1, 'idiotic': 1, 'FIlm': 1, 'I': 1, '...	{'what': 2, 'idiotic': 2, 'film': 2, 'i': 1, '...	what idiotic film what idiotic film i can sa...	what idiotic film what idiotic film that phoen...	{'what': 2, 'idiotic': 2, 'film': 2, 'that': 1...	{'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...	[what_idiotic, idiotic_film, film_i, i_can, ca...	[what_idiotic, idiotic_film, film_i, i_can, ca...
4	Terrible\nThe only thing good about this movi...	N	[ Terrible\nThe only thing good about this mov...	9	[terrible, the, only, thing, good, about, this...	124	[terrible, thing, good, movie, phoenixs, actin...	65	[(the, 5), (this, 4), (movie, 4), (it, 4), (to...	[(movie, 4), (terrible, 3), (acting, 3), (good...	...	[it_NEG, the_NEG, to_NEG, and_NEG, for_NEG, th...	{'Terrible': 1, 'The': 2, 'only': 1, 'thing': ...	{'terrible': 4, 'the': 5, 'only': 1, 'thing': ...	terrible terrible the only thing good about ...	terrible terrible only thing good about this m...	{'terrible': 4, 'only': 1, 'thing': 1, 'good':...	{'terrible': 1, 'the': 1, 'only': 1, 'thing': ...	[terrible_the, the_only, only_thing, thing_goo...	[terrible_the, the_only, only_thing, thing_goo...

	missed	opportunity	been	very	excited	this	movie	ever	since	heard	...	information	follows	stress	overlook	offers	easy	answers	alike	company	acceptable
PoN
N	3	3	4	3	1	2	3	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	3	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	2	2	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	4	4	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	>	offers	easy	answers	non-fans	alike	company	til	acceptable
PoN
N	3	3	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	Missed	Opportunity	I	had	been	very	excited	to	see	this	...	Strong	iq	Deff	comparable	knight	compensate	company	up.Movie	til	acceptable
PoN
N	1	1	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	paced_NEG	annoying_NEG	deff_NEG	comparable_NEG	compensate_NEG	marketing	company	climax	strange	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0

	missed_opportunity	opportunity_i	i_had	had_been	been_very	very_excited	excited_to	to_see	see_this	this_movie	...	slow_do	happen_the	middle_and	climax_cinematography	joker_acting	fine_if	forced_at	just_strange	strange_but	but_acceptable
PoN
N	2	1	3	2	1	1	1	1	1	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	excited	to	see	heard	it	and	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	1	6	2	1	9	7	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	1	0	2	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	2	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	4	0	0	4	3	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	been	very	excited	this	movie	ever	since	heard	...	information	follows	stress	overlook	offers	easy	answers	alike	company	acceptable
PoN
N	3	3	4	3	1	2	3	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	3	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	2	2	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	4	4	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	>	offers	easy	answers	non-fans	alike	company	til	acceptable
PoN
N	3	3	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	Missed	Opportunity	I	had	been	very	excited	to	see	this	...	Strong	iq	Deff	comparable	knight	compensate	company	up.Movie	til	acceptable
PoN
N	1	1	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	paced_NEG	annoying_NEG	deff_NEG	comparable_NEG	compensate_NEG	marketing	company	climax	strange	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0

	missed_opportunity	opportunity_i	i_had	had_been	been_very	very_excited	excited_to	to_see	see_this	this_movie	...	slow_do	happen_the	middle_and	climax_cinematography	joker_acting	fine_if	forced_at	just_strange	strange_but	but_acceptable
PoN
N	2	1	3	2	1	1	1	1	1	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	excited	to	see	heard	it	and	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	1	6	2	1	9	7	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	1	0	2	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	2	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	4	0	0	4	3	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

Import data from files¶

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

STEP 2b: Label it¶

STEP 2c: Combine the dfs¶

STEP 3: TOKENIZE (and clean)!!¶

STEP 4: Remove Stopwords¶

STEP 5: Create a Frequency Distribution¶

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

DIY SUMMARY¶

Doing VADER on the Summary Section¶

Doing VADER on the Most Frequent Words¶

STEP 7: Test Step 6 with Machine Learning!!¶

Naive Bayes¶

TEST 1: Vader Scores (Original)¶

TEST 2: Vader Scores (from Summary)¶

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

TEST 5: All compound Vader Scores¶

TEST 6: ALL THE NUMBERS!!¶

TEST 7: Test UNFILTERED most frequent words¶

STEP 8: Add Bag of Words to Machine Learning models¶

Create BOW using (1) casual_tokenize and (2) Counter¶

Get a Bag of Words from a column (for wordclouds etc!)¶

Get BOW for all, BOW for positive, BOW for negative¶

Returning to Most Frequent Words¶

Only tokenizing words that aren't in the 1000 most common¶

STEP 7: Test `Step 6` with Machine Learning!!¶

TEST 5: All `compound` Vader Scores¶

Create BOW using (1) `casual_tokenize` and (2) `Counter`¶

	missed	opportunity	been	very	excited	this	movie	ever	since	heard	...	information	follows	stress	overlook	offers	easy	answers	alike	company	acceptable
PoN
N	3	3	4	3	1	2	3	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	3	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	2	2	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	4	4	0	0	0	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	>	offers	easy	answers	non-fans	alike	company	til	acceptable
PoN
N	3	3	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	Missed	Opportunity	I	had	been	very	excited	to	see	this	...	Strong	iq	Deff	comparable	knight	compensate	company	up.Movie	til	acceptable
PoN
N	1	1	14	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	paced_NEG	annoying_NEG	deff_NEG	comparable_NEG	compensate_NEG	marketing	company	climax	strange	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	0	0	1	...	0	0	0	0	0	0	0	0	0	0

	missed_opportunity	opportunity_i	i_had	had_been	been_very	very_excited	excited_to	to_see	see_this	this_movie	...	slow_do	happen_the	middle_and	climax_cinematography	joker_acting	fine_if	forced_at	just_strange	strange_but	but_acceptable
PoN
N	2	1	3	2	1	1	1	1	1	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	excited	to	see	heard	it	and	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	1	6	2	1	9	7	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	1	1	0	2	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	1	0	0	1	0	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	2	0	0	0	4	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	4	0	0	4	3	...	0	0	0	0	0	0	0	0	0	0

	missed	opportunity	i	had	been	very	excited	to	see	this	...	overlook	easy	answers	alike	iq	deff	comparable	compensate	company	acceptable
PoN
N	2	2	12	4	4	3	1	6	2	2	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	1	0	0	1	1	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	0	0	0	0	0	1	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	1	0	0	0	0	2	0	1	...	0	0	0	0	0	0	0	0	0	0
N	0	0	2	0	0	0	0	4	0	4	...	0	0	0	0	0	0	0	0	0	0