HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

##########################################
# NOTE: I'm toying with the idea of importing each library just above
# the place where I use it, so that it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()

Import data from files¶

import os
def get_data_from_files(path):
    """Read every regular file directly inside *path* and return their contents.

    Args:
        path: Directory holding the text files; a trailing separator is optional.

    Returns:
        A list with one string per file, ordered by file name.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded train/test split) reproducible.
    for file in sorted(os.listdir(path)):
        full_path = os.path.join(path, file)
        # Skip sub-directories (e.g. notebook checkpoint folders): open() cannot read them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as f:
            results.append(f.read())
    return results

# Load the raw review texts: one list of strings per sentiment folder.
neg, pos = (
    get_data_from_files(folder)
    for folder in ('../neg_cornell/', '../pos_cornell/')
)

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

import pandas as pd
# Wrap each list of reviews in a single-column DataFrame (the text column is labelled 0).
neg_df, pos_df = pd.DataFrame(neg), pd.DataFrame(pos)

STEP 2b: Label it¶

# Tag every row with its class: 'N' for the negative frame, 'P' for the positive one.
neg_df['PoN'] = 'N'
pos_df['PoN'] = 'P'

STEP 2c: Combine the dfs¶

# Stack the negative reviews on top of the positive ones.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported replacement. Each frame keeps its own index, as append did.
all_df = pd.concat([neg_df, pos_df])

all_df

STEP 3: TOKENIZE (and clean)!!¶

from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

## Came back and added sentence tokenization for the "Summary experiment"
def get_sentence_tokens(review):
    """Split the text of one review into a list of sentence strings."""
    sentences = sent_tokenize(review)
    return sentences
    
# Sentence-tokenize the raw text (column 0), then count the sentences per review.
all_df['sentences'] = all_df[0].apply(get_sentence_tokens)
all_df['num_sentences'] = all_df['sentences'].apply(len)

def get_tokens(sentence):
    """Word-tokenize *sentence*, keep purely alphabetic tokens, and lowercase them."""
    kept = []
    for token in word_tokenize(sentence):
        # Anything containing digits or punctuation is dropped.
        if token.isalpha():
            kept.append(token.lower())
    return kept

# Clean word tokens of the raw text (column 0), plus the token count per review.
all_df['tokens'] = all_df[0].apply(get_tokens)
all_df['num_tokens'] = all_df['tokens'].apply(len)

all_df

STEP 4: Remove Stopwords¶

from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    """Return the tokens of *sentence* that are not in ``stop_words``, in order."""
    return [token for token in sentence if token not in stop_words]
# Stopword-filtered tokens and how many of them remain per review.
all_df['no_sw'] = all_df['tokens'].apply(remove_stopwords)
all_df['num_no_sw'] = all_df['no_sw'].apply(len)

all_df

STEP 5: Create a Frequency Distribution¶

from nltk.probability import FreqDist
def get_most_common(tokens):
    """Return the 12 most frequent tokens as (token, count) pairs."""
    return FreqDist(tokens).most_common(12)
# Top words computed over all clean tokens (stopwords still included).
all_df['topwords_unfil'] = all_df['tokens'].apply(get_most_common)

def get_most_common(tokens):
    # NOTE: this repeats the get_most_common defined just above for the
    # unfiltered tokens; it is redefined here with identical behavior.
    top_n = 12
    counts = FreqDist(tokens)
    return counts.most_common(top_n)
# Top words computed over the stopword-filtered tokens.
all_df['topwords_fil'] = all_df['no_sw'].apply(get_most_common)

def get_fdist(tokens):
    """Build an nltk FreqDist from a list of tokens."""
    fdist = FreqDist(tokens)
    return fdist
    
# Frequency distributions: filtered (no stopwords) and unfiltered (all clean tokens).
all_df['freq_dist'] = all_df['no_sw'].apply(get_fdist)
all_df['freq_dist_unfil'] = all_df['tokens'].apply(get_fdist)

all_df

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def get_vader_score(review):
    """Return the polarity scores the shared VADER analyzer gives *review*."""
    scores = sid.polarity_scores(review)
    return scores

# VADER scores for the full raw text of each review (column 0).
all_df['vader_all'] = all_df[0].apply(get_vader_score)

def separate_vader_score(vader_score, key):
    """Look up one component (*key*) in a VADER score mapping.

    Raises KeyError when *key* is not present in *vader_score*.
    """
    component = vader_score[key]
    return component

# Spread the full-text VADER scores into one numeric column per component.
all_df['v_compound'] = all_df['vader_all'].apply(separate_vader_score, key='compound')
all_df['v_neg'] = all_df['vader_all'].apply(separate_vader_score, key='neg')
all_df['v_neu'] = all_df['vader_all'].apply(separate_vader_score, key='neu')
all_df['v_pos'] = all_df['vader_all'].apply(separate_vader_score, key='pos')

DIY SUMMARY¶

def get_weighted_freq_dist(review, freq_dist):
    """Return a copy of *freq_dist* with every count divided by the largest count.

    Args:
        review: Unused; kept so existing callers keep working.
        freq_dist: Mapping of word -> count (e.g. an nltk FreqDist).

    Returns:
        A new mapping of the same type whose values are count / max count, so
        the most frequent word has weight 1.0. An empty input yields an empty
        copy instead of raising from max().
    """
    # Work on a copy: writing the weights back into the caller's object would
    # silently replace the raw counts wherever that object is also stored.
    weighted = freq_dist.copy()
    if not freq_dist:
        return weighted
    max_freq = max(freq_dist.values())
    for word, count in freq_dist.items():
        weighted[word] = count / max_freq
    return weighted

# Normalized word weights per review, built row by row from the sentence list
# and the filtered frequency distribution.
all_df['weighted_freq_dist'] = [
    get_weighted_freq_dist(sentences, fdist)
    for sentences, fdist in zip(all_df['sentences'], all_df['freq_dist'])
]

def get_sentence_score(review, freq_dist):
    """Score each short sentence by summing the weights of the words it contains.

    Args:
        review: Iterable of sentence strings.
        freq_dist: Mapping of word -> weight (or count).

    Returns:
        A dict of sentence -> summed weight. Sentences with 30 or more
        space-separated chunks, and sentences containing no word from
        *freq_dist*, are left out. A sentence that occurs more than once
        accumulates the score of every occurrence.
    """
    sentence_scores = {}
    for sent in review:
        # The length limit does not depend on the word, so check it once per sentence.
        if len(sent.split(' ')) >= 30:
            continue
        # Use the word_tokenize imported at file level; the bare name `nltk`
        # is only in scope as a side effect of a wildcard import.
        for word in word_tokenize(sent.lower()):
            if word in freq_dist:
                sentence_scores[sent] = sentence_scores.get(sent, 0) + freq_dist[word]
    return sentence_scores

# Score sentences against the normalized weights column explicitly, instead of
# reading 'freq_dist' and depending on it having been normalized in place.
all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['weighted_freq_dist']), axis=1)

def get_summary_sentences(sentence_scores):
    """Build a summary from the five highest-scoring sentences.

    Args:
        sentence_scores: Mapping of sentence -> score.

    Returns:
        The top five sentences (fewer if fewer exist), best first, joined by a
        single space; an empty string for an empty mapping.
    """
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    # Join with a space: an empty separator fuses the end of one sentence with
    # the start of the next (e.g. "problems .i figured").
    return ' '.join(sent for sent, _score in sorted_sentences[:5])

# One summary string per review, built from that review's sentence scores.
all_df['summary_sentences'] = all_df['sentence_scores'].apply(get_summary_sentences)

summaries = list(all_df['summary_sentences'])

summaries[3]

"cell-phones ring every five minutes , and everyone hurriedly rushes along , leaving marginal time for the frustrated viewer to relate to the sisters' issues and problems .i figured i needed to get in touch with my feminine side , and `hanging up' seemed like an ideal opportunity to do so .ryan's convincing performance and diverting cuteness are two of the more agreeable aspects of `hanging up' .it's certainly a far cry from what one would label as a rewarding experience , but `hanging up' should have at least been enjoyable .maddy ( kudrow ) , the soap opera actress , spends time either contemplating her possible path to stardom or nursing her dog ."

Doing VADER on the Summary Section¶

# VADER scores of the summary text, then one column per score component.
all_df['vader_sum_all'] = all_df['summary_sentences'].apply(get_vader_score)

all_df['v_compound_sum'] = all_df['vader_sum_all'].apply(separate_vader_score, key='compound')
all_df['v_neg_sum'] = all_df['vader_sum_all'].apply(separate_vader_score, key='neg')
all_df['v_neu_sum'] = all_df['vader_sum_all'].apply(separate_vader_score, key='neu')
all_df['v_pos_sum'] = all_df['vader_sum_all'].apply(separate_vader_score, key='pos')

Doing VADER on the Most Frequent Words¶

def get_freq_words(freq_dist):
    """Return the 50 highest-valued words of *freq_dist* as one space-separated string.

    Words are ordered from highest to lowest value; words with equal values
    keep the order in which they appear in *freq_dist*.
    """
    ranked = sorted(freq_dist, key=freq_dist.get, reverse=True)
    return ' '.join(ranked[:50])

# Most frequent filtered words per review, as one string that VADER can score.
all_df['v_freq_words'] = all_df['freq_dist'].apply(get_freq_words)

# VADER scores of that string, then one column per score component.
all_df['vader_fq_all'] = all_df['v_freq_words'].apply(get_vader_score)
all_df['v_compound_fd'] = all_df['vader_fq_all'].apply(separate_vader_score, key='compound')
all_df['v_neg_fd'] = all_df['vader_fq_all'].apply(separate_vader_score, key='neg')
all_df['v_neu_fd'] = all_df['vader_fq_all'].apply(separate_vader_score, key='neu')
all_df['v_pos_fd'] = all_df['vader_fq_all'].apply(separate_vader_score, key='pos')

STEP 7: Test `Step 6` with Machine Learning!!¶

Naive Bayes¶

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def get_NB(small_df, labels):
    """Fit Gaussian Naive Bayes on a seeded 70/30 split and print the test accuracy.

    Args:
        small_df: DataFrame holding the feature columns.
        labels: Class labels, one per row of *small_df*.
    """
    from sklearn import metrics

    features = small_df.values
    split = train_test_split(features, labels, test_size=0.3, random_state=109)
    train_x, test_x, train_y, test_y = split

    classifier = GaussianNB()
    classifier.fit(train_x, train_y)
    predictions = classifier.predict(test_x)
    print("Accuracy:", metrics.accuracy_score(test_y, predictions))

TEST 1: Vader Scores (Original)¶

# Features: VADER scores of the full review text (recorded accuracy: 0.645).
small_df = all_df.filter(items=['v_compound', 'v_pos', 'v_neg', 'v_neu'])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.645

TEST 2: Vader Scores (from Summary)¶

# Features: VADER scores of the summary text (recorded accuracy: 0.59).
small_df = all_df.filter(items=['v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum'])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.59

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

# Features: summary VADER scores plus full-text VADER scores (recorded accuracy: 0.618).
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.6183333333333333

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

# Features: VADER scores of the most frequent filtered words (recorded accuracy: 0.598).
small_df = all_df.filter(items=['v_compound_fd', 'v_pos_fd', 'v_neu_fd', 'v_neg_fd'])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.5983333333333334

TEST 5: All `compound` Vader Scores¶

# Features: only the three `compound` VADER scores (recorded accuracy: 0.615).
small_df = all_df.filter(items=['v_compound_fd', 'v_compound_sum', 'v_compound'])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.615

TEST 6: ALL THE NUMBERS!!¶

# Features: summary, frequent-word and full-text VADER scores together (recorded accuracy: 0.613).
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound_fd', 'v_pos_fd', 'v_neg_fd', 'v_neu_fd',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.6133333333333333

TEST 7: Test UNFILTERED most frequent words¶

def get_freq_words(freq_dist):
    """Join the 50 highest-valued words of *freq_dist* with single spaces, highest first."""
    ranked = sorted(freq_dist.items(), key=lambda pair: pair[1], reverse=True)
    top_words = [word for word, _value in ranked[:50]]
    return ' '.join(top_words)

# Most frequent words per review taken from the unfiltered distribution.
all_df['v_freq_words_unfil'] = all_df['freq_dist_unfil'].apply(get_freq_words)

# VADER scores of that string.
all_df['vader_fd_all_unfil'] = all_df['v_freq_words_unfil'].apply(get_vader_score)

# One numeric column per score component.
all_df['v_compound_fd_uf'] = all_df['vader_fd_all_unfil'].apply(separate_vader_score, key='compound')
all_df['v_neg_fd_uf'] = all_df['vader_fd_all_unfil'].apply(separate_vader_score, key='neg')
all_df['v_neu_fd_uf'] = all_df['vader_fd_all_unfil'].apply(separate_vader_score, key='neu')
all_df['v_pos_fd_uf'] = all_df['vader_fd_all_unfil'].apply(separate_vader_score, key='pos')

# Features: every VADER score column, including the unfiltered frequent-word
# scores (recorded accuracy: 0.62).
small_df = all_df.filter(items=[
    'v_compound_sum', 'v_pos_sum', 'v_neg_sum', 'v_neu_sum',
    'v_compound_fd', 'v_pos_fd', 'v_neg_fd', 'v_neu_fd',
    'v_compound_fd_uf', 'v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
    'v_compound', 'v_pos', 'v_neg', 'v_neu',
])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.62

# Features: VADER scores of the unfiltered frequent words only (recorded accuracy: 0.603).
small_df = all_df.filter(items=['v_compound_fd_uf', 'v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf'])
get_NB(small_df, all_df['PoN'])

Accuracy: 0.6033333333333334

	0	PoN
0	bad . bad . \nbad . \nthat one word seems to p...	N
1	isn't it the ultimate sign of a movie's cinema...	N
2	" gordy " is not a movie , it is a 90-minute-...	N
3	disconnect the phone line . \ndon't accept the...	N
4	when robert forster found himself famous again...	N
...	...	...
995	one of the funniest carry on movies and the th...	P
996	i remember making a pact , right after `patch ...	P
997	barely scrapping by playing at a nyc piano bar...	P
998	if the current trends of hollywood filmmaking ...	P
999	capsule : the director of cure brings a weird ...	P

	0	PoN	sentences	num_sentences	tokens	num_tokens
0	bad . bad . \nbad . \nthat one word seems to p...	N	[bad ., bad ., bad ., that one word seems to p...	67	[bad, bad, bad, that, one, word, seems, to, pr...	1071
1	isn't it the ultimate sign of a movie's cinema...	N	[isn't it the ultimate sign of a movie's cinem...	32	[is, it, the, ultimate, sign, of, a, movie, ci...	553
2	" gordy " is not a movie , it is a 90-minute-...	N	[ " gordy " is not a movie , it is a 90-minute...	23	[gordy, is, not, a, movie, it, is, a, sesame, ...	478
3	disconnect the phone line . \ndon't accept the...	N	[disconnect the phone line ., don't accept the...	37	[disconnect, the, phone, line, do, accept, the...	604
4	when robert forster found himself famous again...	N	[when robert forster found himself famous agai...	29	[when, robert, forster, found, himself, famous...	386
...	...	...	...	...	...	...
995	one of the funniest carry on movies and the th...	P	[one of the funniest carry on movies and the t...	25	[one, of, the, funniest, carry, on, movies, an...	434
996	i remember making a pact , right after `patch ...	P	[i remember making a pact , right after `patch...	40	[i, remember, making, a, pact, right, after, p...	652
997	barely scrapping by playing at a nyc piano bar...	P	[barely scrapping by playing at a nyc piano ba...	23	[barely, scrapping, by, playing, at, a, nyc, p...	345
998	if the current trends of hollywood filmmaking ...	P	[if the current trends of hollywood filmmaking...	34	[if, the, current, trends, of, hollywood, film...	730
999	capsule : the director of cure brings a weird ...	P	[capsule : the director of cure brings a weird...	45	[capsule, the, director, of, cure, brings, a, ...	641

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw
0	bad . bad . \nbad . \nthat one word seems to p...	N	[bad ., bad ., bad ., that one word seems to p...	67	[bad, bad, bad, that, one, word, seems, to, pr...	1071	[bad, bad, bad, one, word, seems, pretty, much...	515
1	isn't it the ultimate sign of a movie's cinema...	N	[isn't it the ultimate sign of a movie's cinem...	32	[is, it, the, ultimate, sign, of, a, movie, ci...	553	[ultimate, sign, movie, cinematic, ineptitude,...	297
2	" gordy " is not a movie , it is a 90-minute-...	N	[ " gordy " is not a movie , it is a 90-minute...	23	[gordy, is, not, a, movie, it, is, a, sesame, ...	478	[gordy, movie, sesame, street, skit, bad, one,...	239
3	disconnect the phone line . \ndon't accept the...	N	[disconnect the phone line ., don't accept the...	37	[disconnect, the, phone, line, do, accept, the...	604	[disconnect, phone, line, accept, charges, any...	323
4	when robert forster found himself famous again...	N	[when robert forster found himself famous agai...	29	[when, robert, forster, found, himself, famous...	386	[robert, forster, found, famous, appearing, ja...	185
...	...	...	...	...	...	...	...	...
995	one of the funniest carry on movies and the th...	P	[one of the funniest carry on movies and the t...	25	[one, of, the, funniest, carry, on, movies, an...	434	[one, funniest, carry, movies, third, medical,...	241
996	i remember making a pact , right after `patch ...	P	[i remember making a pact , right after `patch...	40	[i, remember, making, a, pact, right, after, p...	652	[remember, making, pact, right, patch, adams, ...	361
997	barely scrapping by playing at a nyc piano bar...	P	[barely scrapping by playing at a nyc piano ba...	23	[barely, scrapping, by, playing, at, a, nyc, p...	345	[barely, scrapping, playing, nyc, piano, bar, ...	177
998	if the current trends of hollywood filmmaking ...	P	[if the current trends of hollywood filmmaking...	34	[if, the, current, trends, of, hollywood, film...	730	[current, trends, hollywood, filmmaking, conti...	428
999	capsule : the director of cure brings a weird ...	P	[capsule : the director of cure brings a weird...	45	[capsule, the, director, of, cure, brings, a, ...	641	[capsule, director, cure, brings, weird, compl...	340

	0	PoN	sentences	num_sentences	tokens	num_tokens	no_sw	num_no_sw	topwords_unfil	topwords_fil	freq_dist	freq_dist_unfil
0	bad . bad . \nbad . \nthat one word seems to p...	N	[bad ., bad ., bad ., that one word seems to p...	67	[bad, bad, bad, that, one, word, seems, to, pr...	1071	[bad, bad, bad, one, word, seems, pretty, much...	515	[(the, 60), (a, 35), (to, 34), (of, 24), (this...	[(movie, 17), (bad, 8), (one, 7), (meyer, 6), ...	{'bad': 8, 'one': 7, 'word': 1, 'seems': 1, 'p...	{'bad': 8, 'that': 19, 'one': 7, 'word': 1, 's...
1	isn't it the ultimate sign of a movie's cinema...	N	[isn't it the ultimate sign of a movie's cinem...	32	[is, it, the, ultimate, sign, of, a, movie, ci...	553	[ultimate, sign, movie, cinematic, ineptitude,...	297	[(the, 28), (a, 18), (of, 16), (to, 14), (i, 1...	[(movie, 7), (one, 6), (first, 5), (much, 4), ...	{'ultimate': 1, 'sign': 1, 'movie': 7, 'cinema...	{'is': 11, 'it': 11, 'the': 28, 'ultimate': 1,...
2	" gordy " is not a movie , it is a 90-minute-...	N	[ " gordy " is not a movie , it is a 90-minute...	23	[gordy, is, not, a, movie, it, is, a, sesame, ...	478	[gordy, movie, sesame, street, skit, bad, one,...	239	[(the, 25), (and, 21), (to, 18), (is, 17), (a,...	[(gordy, 8), (movie, 5), (one, 4), (stupid, 4)...	{'gordy': 8, 'movie': 5, 'sesame': 1, 'street'...	{'gordy': 8, 'is': 17, 'not': 3, 'a': 17, 'mov...
3	disconnect the phone line . \ndon't accept the...	N	[disconnect the phone line ., don't accept the...	37	[disconnect, the, phone, line, do, accept, the...	604	[disconnect, phone, line, accept, charges, any...	323	[(the, 41), (of, 17), (a, 17), (to, 16), (and,...	[(hanging, 9), (sisters, 5), (ryan, 4), (time,...	{'disconnect': 1, 'phone': 2, 'line': 1, 'acce...	{'disconnect': 1, 'the': 41, 'phone': 2, 'line...
4	when robert forster found himself famous again...	N	[when robert forster found himself famous agai...	29	[when, robert, forster, found, himself, famous...	386	[robert, forster, found, famous, appearing, ja...	185	[(the, 21), (it, 11), (i, 10), (to, 10), (of, ...	[(film, 5), (movie, 5), (american, 4), (perfek...	{'robert': 2, 'forster': 3, 'found': 1, 'famou...	{'when': 2, 'robert': 2, 'forster': 3, 'found'...
...	...	...	...	...	...	...	...	...	...	...	...	...
995	one of the funniest carry on movies and the th...	P	[one of the funniest carry on movies and the t...	25	[one, of, the, funniest, carry, on, movies, an...	434	[one, funniest, carry, movies, third, medical,...	241	[(the, 26), (and, 21), (of, 11), (a, 10), (is,...	[(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4...	{'one': 1, 'funniest': 1, 'carry': 4, 'movies'...	{'one': 1, 'of': 11, 'the': 26, 'funniest': 1,...
996	i remember making a pact , right after `patch ...	P	[i remember making a pact , right after `patch...	40	[i, remember, making, a, pact, right, after, p...	652	[remember, making, pact, right, patch, adams, ...	361	[(the, 44), (of, 29), (and, 19), (a, 15), (it,...	[(music, 8), (heart, 7), (craven, 6), (movie, ...	{'remember': 1, 'making': 1, 'pact': 1, 'right...	{'i': 1, 'remember': 1, 'making': 1, 'a': 15, ...
997	barely scrapping by playing at a nyc piano bar...	P	[barely scrapping by playing at a nyc piano ba...	23	[barely, scrapping, by, playing, at, a, nyc, p...	345	[barely, scrapping, playing, nyc, piano, bar, ...	177	[(a, 23), (is, 16), (the, 13), (and, 10), (of,...	[(like, 4), (hutton, 3), (old, 3), (high, 2), ...	{'barely': 1, 'scrapping': 1, 'playing': 1, 'n...	{'barely': 1, 'scrapping': 1, 'by': 2, 'playin...
998	if the current trends of hollywood filmmaking ...	P	[if the current trends of hollywood filmmaking...	34	[if, the, current, trends, of, hollywood, film...	730	[current, trends, hollywood, filmmaking, conti...	428	[(the, 49), (of, 31), (and, 19), (in, 18), (to...	[(one, 7), (like, 5), (l, 5), (hollywood, 4), ...	{'current': 1, 'trends': 1, 'hollywood': 4, 'f...	{'if': 1, 'the': 49, 'current': 1, 'trends': 1...
999	capsule : the director of cure brings a weird ...	P	[capsule : the director of cure brings a weird...	45	[capsule, the, director, of, cure, brings, a, ...	641	[capsule, director, cure, brings, weird, compl...	340	[(the, 33), (to, 28), (and, 21), (a, 18), (of,...	[(computer, 11), (kurosawa, 8), (one, 5), (see...	{'capsule': 1, 'director': 1, 'cure': 3, 'brin...	{'capsule': 1, 'the': 33, 'director': 1, 'of':...

HW2: VECTORIZATION (Pandas style!)¶

STEP 1: Import ALL the things¶

Import libraries¶

Import data from files¶

STEP 2: Prep Data¶

STEP 2a: Turn that fresh text into a pandas DF¶

STEP 2b: Label it¶

STEP 2c: Combine the dfs¶

STEP 3: TOKENIZE (and clean)!!¶

STEP 4: Remove Stopwords¶

STEP 5: Create a Frequency Distribution¶

STEP 6: Try Different Sentiment Analysis Tools¶

VADER¶

DIY SUMMARY¶

Doing VADER on the Summary Section¶

Doing VADER on the Most Frequent Words¶

STEP 7: Test Step 6 with Machine Learning!!¶

Naive Bayes¶

TEST 1: Vader Scores (Original)¶

TEST 2: Vader Scores (from Summary)¶

TEST 3: Vader Scores (original) AND Vader Scores (summary)¶

TEST 4: Vader Scores (50 most frequent -- filtered -- words)¶

TEST 5: All compound Vader Scores¶

TEST 6: ALL THE NUMBERS!!¶

TEST 7: Test UNFILTERED most frequent words¶

STEP 7: Test `Step 6` with Machine Learning!!¶

TEST 5: All `compound` Vader Scores¶