HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

##########################################
# NOTE: I'm toying with the idea of requiring the library just above 
# when I use it so it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()

Import data from files¶

import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

# neg = get_data_from_files('../neg_cornell/')
# pos = get_data_from_files('../pos_cornell/')

neg = get_data_from_files('../hw4_lie_false/')
pos = get_data_from_files('../hw4_lie_true/')

# neg = get_data_from_files('../neg_hw4/')
# pos = get_data_from_files('../pos_hw4/')

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

STEP 2b: Label it¶

pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'

STEP 2c: Combine the dfs¶

all_df = neg_df.append(pos_df)

all_df

STEP 3: TOKENIZE (and clean)!!¶

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Came back and added sentences for tokinization for "Summary experiment"
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)

all_df

STEP 4: Remove Stopwords¶

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)

all_df

STEP 5: Create a Frequency Distribution¶

from nltk.probability import FreqDist
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)

def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)

def get_fdist(tokens):
    return (FreqDist(tokens))
    
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)
all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)

all_df

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def get_vader_score(review):
    return sid.polarity_scores(review)

all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)

def separate_vader_score(vader_score, key):
    return vader_score[key]

all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)
all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)
all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)
all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)

DIY SUMMARY¶

all_df[0][17]

17    I went to Joeys and had the best lasagna on th...
17    Halos is home. I have been here numerous times...
Name: 0, dtype: object

def get_weighted_freq_dist(review, freq_dist):
    try:
        max_freq = max(freq_dist.values())
        for word in freq_dist.keys():
            freq_dist[word] = (freq_dist[word]/max_freq)
        return freq_dist
    except:
        return 'nope'

all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)

def get_sentence_score(review, freq_dist):
    sentence_scores = {}
    for sent in review:
        for word in nltk.word_tokenize(sent.lower()):
            if word in freq_dist.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = freq_dist[word]
                    else:
                        sentence_scores[sent] += freq_dist[word]
    return sentence_scores

all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)

def get_summary_sentences(sentence_scores):
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    return ''.join(sent[0] for sent in sorted_sentences[:5])

all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)

summaries = all_df['summary_sentences'].tolist()

summaries[3]

'You dont need choose your menu by yourself and they would accord to your customer to help you choose the best menu for you.When you enter the restaurant you would feel that you drop into a new world.The menu consisted by 16 courses and everyone is amazing taste.It is a France restaurant which has Michelin three stars.The dining would spend 3 hours.'

Doing VADER on the Summary Section¶

all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)

all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)
all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)
all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)
all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)

Doing VADER on the Most Frequent Words¶

def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)

all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)
all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)
all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)
all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)
all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)

STEP 7: Test `Step 6` with Machine Learning!!¶

Naive Bayes¶

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def get_NB(small_df, labels):
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    from sklearn import metrics
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

TEST 1: Vader Scores (Original)¶

small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'])

Accuracy: 0.39285714285714285

TEST 2: Vader Scores (from Summary)¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'])

Accuracy: 0.39285714285714285

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'])

Accuracy: 0.39285714285714285

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598
get_NB(small_df, all_df['PoN'])

Accuracy: 0.4642857142857143

TEST 5: All `compound` Vader Scores¶

small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615
get_NB(small_df, all_df['PoN'])

Accuracy: 0.39285714285714285

TEST 6: ALL THE NUMBERS!!¶

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613
get_NB(small_df, all_df['PoN'])

Accuracy: 0.42857142857142855

TEST 7: Test UNFILTERED most frequent words¶

def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)

all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)

all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)
all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)
all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)
all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)

small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'])

Accuracy: 0.42857142857142855

small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603
get_NB(small_df, all_df['PoN'])

Accuracy: 0.32142857142857145

summaries_pos = all_df[all_df['PoN'] == 'P']
summaries_neg = all_df[all_df['PoN'] == 'N']

summaries_pos_list = summaries_pos['summary_sentences'].tolist()
summaries_neg_list = summaries_neg['summary_sentences'].tolist()

STEP 8: Test NLTK: Naive Bayes from HW1¶

from nltk.classify import NaiveBayesClassifier
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

def get_nltk_train_test(array, label, num_train):
    tokens = [get_tokens(sentence) for sentence in array]
    docs = [(sent, label) for sent in tokens]
    train_docs = docs[:num_train]
    test_docs = docs[num_train:len(array)]
    return [train_docs, test_docs]


def get_nltk_NB(NEG_DATA, POS_DATA, num_train):
    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)
    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)

    training_docs = train_neg + train_pos
    testing_docs = test_neg + test_pos

    sentim_analyzer = SentimentAnalyzer()
    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
    training_set = sentim_analyzer.apply_features(training_docs)
    test_set = sentim_analyzer.apply_features(testing_docs)

    trainer = NaiveBayesClassifier.train
    classifier = sentim_analyzer.train(trainer, training_set)
    
    results = []
    for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):
        print('{0}: {1}'.format(key,value))

neg_df = all_df[all_df['PoN'] == 'N']
neg_df_list = neg_df[0].tolist()

pos_df = all_df[all_df['PoN'] == 'P']
pos_df_list = pos_df[0].tolist()

get_nltk_NB(neg_df_list, pos_df_list, 32)

Training classifier
Evaluating NaiveBayesClassifier results...
Accuracy: 0.5714285714285714
F-measure [neg]: 0.5714285714285714
F-measure [pos]: 0.5714285714285714
Precision [neg]: 0.5714285714285714
Precision [pos]: 0.5714285714285714
Recall [neg]: 0.5714285714285714
Recall [pos]: 0.5714285714285714

	0	PoN
0	Gannon’s Isle Ice Cream served the best ice cr...	N
1	Hibachi the grill is one of my favorite restau...	N
2	RIM KAAP One of the best Thai restaurants in t...	N
3	It is a France restaurant which has Michelin t...	N
4	Its hard to pick a favorite dining experience ...	N
...	...	...
41	A big piano is in the middle of the lobby and ...	P
42	Cant say too much about it. Just try it buddy!...	P
43	I once went to a restaurant which was not very...	P
44	The worst restaurant experience of my life hap...	P
45	I ordered the food and took it back home when ...	P

	0	PoN	sentences	num_sentences	tokens	num_tokens
0	Gannon’s Isle Ice Cream served the best ice cr...	N	[Gannon’s Isle Ice Cream served the best ice c...	7	[gannon, s, isle, ice, cream, served, the, bes...	72
1	Hibachi the grill is one of my favorite restau...	N	[Hibachi the grill is one of my favorite resta...	5	[hibachi, the, grill, is, one, of, my, favorit...	65
2	RIM KAAP One of the best Thai restaurants in t...	N	[RIM KAAP One of the best Thai restaurants in ...	11	[rim, kaap, one, of, the, best, thai, restaura...	141
3	It is a France restaurant which has Michelin t...	N	[It is a France restaurant which has Michelin ...	6	[it, is, a, france, restaurant, which, has, mi...	71
4	Its hard to pick a favorite dining experience ...	N	[Its hard to pick a favorite dining experience...	7	[its, hard, to, pick, a, favorite, dining, exp...	119
...	...	...	...	...	...	...
41	A big piano is in the middle of the lobby and ...	P	[A big piano is in the middle of the lobby and...	4	[a, big, piano, is, in, the, middle, of, the, ...	56
42	Cant say too much about it. Just try it buddy!...	P	[Cant say too much about it., Just try it budd...	3	[cant, say, too, much, about, it, just, try, i...	15
43	I once went to a restaurant which was not very...	P	[I once went to a restaurant which was not ver...	5	[i, once, went, to, a, restaurant, which, was,...	74
44	The worst restaurant experience of my life hap...	P	[The worst restaurant experience of my life ha...	8	[the, worst, restaurant, experience, of, my, l...	128
45	I ordered the food and took it back home when ...	P	[I ordered the food and took it back home when...	3	[i, ordered, the, food, and, took, it, back, h...	50

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw
0	Gannon’s Isle Ice Cream served the best ice cr...	N	[Gannon’s Isle Ice Cream served the best ice c...	7	[gannon, s, isle, ice, cream, served, the, bes...	72	[gannon, isle, ice, cream, served, best, ice, ...	36
1	Hibachi the grill is one of my favorite restau...	N	[Hibachi the grill is one of my favorite resta...	5	[hibachi, the, grill, is, one, of, my, favorit...	65	[hibachi, grill, one, favorite, restaurants, l...	30
2	RIM KAAP One of the best Thai restaurants in t...	N	[RIM KAAP One of the best Thai restaurants in ...	11	[rim, kaap, one, of, the, best, thai, restaura...	141	[rim, kaap, one, best, thai, restaurants, town...	75
3	It is a France restaurant which has Michelin t...	N	[It is a France restaurant which has Michelin ...	6	[it, is, a, france, restaurant, which, has, mi...	71	[france, restaurant, michelin, three, stars, e...	36
4	Its hard to pick a favorite dining experience ...	N	[Its hard to pick a favorite dining experience...	7	[its, hard, to, pick, a, favorite, dining, exp...	119	[hard, pick, favorite, dining, experience, cio...	62
...	...	...	...	...	...	...	...	...
41	A big piano is in the middle of the lobby and ...	P	[A big piano is in the middle of the lobby and...	4	[a, big, piano, is, in, the, middle, of, the, ...	56	[big, piano, middle, lobby, music, light, soft...	24
42	Cant say too much about it. Just try it buddy!...	P	[Cant say too much about it., Just try it budd...	3	[cant, say, too, much, about, it, just, try, i...	15	[cant, say, much, try, buddy, youll, regret, d...	8
43	I once went to a restaurant which was not very...	P	[I once went to a restaurant which was not ver...	5	[i, once, went, to, a, restaurant, which, was,...	74	[went, restaurant, clean, worst, worst, thing,...	34
44	The worst restaurant experience of my life hap...	P	[The worst restaurant experience of my life ha...	8	[the, worst, restaurant, experience, of, my, l...	128	[worst, restaurant, experience, life, happened...	75
45	I ordered the food and took it back home when ...	P	[I ordered the food and took it back home when...	3	[i, ordered, the, food, and, took, it, back, h...	50	[ordered, food, took, back, home, opened, back...	23

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw	topwords_unfil	topwords_fil	freq_dist	freq_dist_unfil
0	Gannon’s Isle Ice Cream served the best ice cr...	N	[Gannon’s Isle Ice Cream served the best ice c...	7	[gannon, s, isle, ice, cream, served, the, bes...	72	[gannon, isle, ice, cream, served, best, ice, ...	36	[(the, 7), (it, 4), (ice, 3), (cream, 3), (and...	[(ice, 3), (cream, 3), (best, 2), (chocolate, ...	{'gannon': 1, 'isle': 1, 'ice': 3, 'cream': 3,...	{'gannon': 1, 's': 1, 'isle': 1, 'ice': 3, 'cr...
1	Hibachi the grill is one of my favorite restau...	N	[Hibachi the grill is one of my favorite resta...	5	[hibachi, the, grill, is, one, of, my, favorit...	65	[hibachi, grill, one, favorite, restaurants, l...	30	[(the, 8), (is, 6), (it, 3), (hibachi, 2), (gr...	[(hibachi, 2), (grill, 2), (restaurants, 2), (...	{'hibachi': 2, 'grill': 2, 'one': 1, 'favorite...	{'hibachi': 2, 'the': 8, 'grill': 2, 'is': 6, ...
2	RIM KAAP One of the best Thai restaurants in t...	N	[RIM KAAP One of the best Thai restaurants in ...	11	[rim, kaap, one, of, the, best, thai, restaura...	141	[rim, kaap, one, best, thai, restaurants, town...	75	[(the, 12), (of, 3), (thai, 3), (and, 3), (for...	[(thai, 3), (food, 3), (rim, 2), (kaap, 2), (b...	{'rim': 2, 'kaap': 2, 'one': 1, 'best': 2, 'th...	{'rim': 2, 'kaap': 2, 'one': 1, 'of': 3, 'the'...
3	It is a France restaurant which has Michelin t...	N	[It is a France restaurant which has Michelin ...	6	[it, is, a, france, restaurant, which, has, mi...	71	[france, restaurant, michelin, three, stars, e...	36	[(you, 7), (the, 5), (would, 3), (to, 3), (you...	[(would, 3), (menu, 3), (restaurant, 2), (choo...	{'france': 1, 'restaurant': 2, 'michelin': 1, ...	{'it': 1, 'is': 2, 'a': 2, 'france': 1, 'resta...
4	Its hard to pick a favorite dining experience ...	N	[Its hard to pick a favorite dining experience...	7	[its, hard, to, pick, a, favorite, dining, exp...	119	[hard, pick, favorite, dining, experience, cio...	62	[(the, 7), (to, 4), (is, 4), (my, 3), (and, 3)...	[(food, 2), (italy, 2), (mother, 2), (hard, 1)...	{'hard': 1, 'pick': 1, 'favorite': 1, 'dining'...	{'its': 2, 'hard': 1, 'to': 4, 'pick': 1, 'a':...
...	...	...	...	...	...	...	...	...	...	...	...	...
41	A big piano is in the middle of the lobby and ...	P	[A big piano is in the middle of the lobby and...	4	[a, big, piano, is, in, the, middle, of, the, ...	56	[big, piano, middle, lobby, music, light, soft...	24	[(is, 7), (the, 3), (that, 3), (and, 2), (so, ...	[(feel, 2), (big, 1), (piano, 1), (middle, 1),...	{'big': 1, 'piano': 1, 'middle': 1, 'lobby': 1...	{'a': 1, 'big': 1, 'piano': 1, 'is': 7, 'in': ...
42	Cant say too much about it. Just try it buddy!...	P	[Cant say too much about it., Just try it budd...	3	[cant, say, too, much, about, it, just, try, i...	15	[cant, say, much, try, buddy, youll, regret, d...	8	[(it, 2), (cant, 1), (say, 1), (too, 1), (much...	[(cant, 1), (say, 1), (much, 1), (try, 1), (bu...	{'cant': 1, 'say': 1, 'much': 1, 'try': 1, 'bu...	{'cant': 1, 'say': 1, 'too': 1, 'much': 1, 'ab...
43	I once went to a restaurant which was not very...	P	[I once went to a restaurant which was not ver...	5	[i, once, went, to, a, restaurant, which, was,...	74	[went, restaurant, clean, worst, worst, thing,...	34	[(the, 5), (i, 4), (to, 4), (a, 3), (not, 3), ...	[(dinner, 3), (would, 3), (worst, 2), (time, 2...	{'went': 1, 'restaurant': 1, 'clean': 1, 'wors...	{'i': 4, 'once': 1, 'went': 1, 'to': 4, 'a': 3...
44	The worst restaurant experience of my life hap...	P	[The worst restaurant experience of my life ha...	8	[the, worst, restaurant, experience, of, my, l...	128	[worst, restaurant, experience, life, happened...	75	[(the, 6), (and, 4), (a, 3), (i, 3), (with, 3)...	[(chicken, 3), (wasnt, 2), (much, 2), (waitres...	{'worst': 1, 'restaurant': 1, 'experience': 1,...	{'the': 6, 'worst': 1, 'restaurant': 1, 'exper...
45	I ordered the food and took it back home when ...	P	[I ordered the food and took it back home when...	3	[i, ordered, the, food, and, took, it, back, h...	50	[ordered, food, took, back, home, opened, back...	23	[(i, 6), (it, 4), (the, 3), (to, 3), (back, 2)...	[(back, 2), (restaurant, 2), (ordered, 1), (fo...	{'ordered': 1, 'food': 1, 'took': 1, 'back': 2...	{'i': 6, 'ordered': 1, 'the': 3, 'food': 1, 'a...

HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

Import data from files¶

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

STEP 2b: Label it¶

STEP 2c: Combine the dfs¶

STEP 3: TOKENIZE (and clean)!!¶

STEP 4: Remove Stopwords¶

STEP 5: Create a Frequency Distribution¶

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

DIY SUMMARY¶

Doing VADER on the Summary Section¶

Doing VADER on the Most Frequent Words¶

STEP 7: Test Step 6 with Machine Learning!!¶

Naive Bayes¶

TEST 1: Vader Scores (Original)¶

TEST 2: Vader Scores (from Summary)¶

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

TEST 5: All compound Vader Scores¶

TEST 6: ALL THE NUMBERS!!¶

TEST 7: Test UNFILTERED most frequent words¶

STEP 8: Test NLTK: Naive Bayes from HW1¶

STEP 7: Test `Step 6` with Machine Learning!!¶

TEST 5: All `compound` Vader Scores¶