HW2xHW4

VECTORIZATION (Pandas style!)

STEP 1: Import ALL the things

Import libraries

In [1]:
##########################################
# NOTE: I'm toying with the idea of requiring the library just above 
# when I use it so it makes more sense in context
##########################################
# import os
# import pandas as pd
# from nltk.tokenize import word_tokenize, sent_tokenize
# from nltk.sentiment import SentimentAnalyzer
# from nltk.sentiment.util import *
# from nltk.probability import FreqDist
# from nltk.sentiment.vader import SentimentIntensityAnalyzer
# sid = SentimentIntensityAnalyzer()

Import data from files

In [166]:
import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results

# neg = get_data_from_files('../neg_cornell/')
# pos = get_data_from_files('../pos_cornell/')

# neg = get_data_from_files('../neg_hw4/')
# pos = get_data_from_files('../pos_hw4/')

pos = get_data_from_files('../hw4_lie_false/')
neg = get_data_from_files('../hw4_lie_true/')

STEP 2: Prep Data

STEP 2a: Turn that fresh text into a pandas DF

In [167]:
import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

STEP 2b: Label it

In [168]:
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'

STEP 2c: Combine the dfs

In [169]:
all_df = neg_df.append(pos_df)
In [170]:
all_df[:3]
Out[170]:
0 PoN
0 ? N
1 Twin Trees Cicero NY HUGE salad bar and high q... N
2 The worst restaurant that I have ever eaten in... N

STEP 3: TOKENIZE (and clean)!!

In [171]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
In [172]:
## Came back and added sentences for tokinization for "Summary experiment"
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)
In [173]:
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)
In [174]:
all_df[:3]
Out[174]:
0 PoN sentences num_sentences tokens num_tokens
0 ? N [?] 1 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105

STEP 4: Remove Stopwords

In [175]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)
In [176]:
all_df[:5]
Out[176]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw
0 ? N [?] 1 [] 0 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49
3 ? N [?] 1 [] 0 [] 0
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23

STEP 5: Create a Frequency Distribution

In [177]:
from nltk.probability import FreqDist
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)
In [178]:
def get_most_common(tokens):
    fdist = FreqDist(tokens)
    return fdist.most_common(12)
all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)
In [179]:
def get_fdist(tokens):
    return (FreqDist(tokens))
    
all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)
all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)
In [180]:
all_df[:3]
Out[180]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw topwords_unfil topwords_fil freq_dist freq_dist_unfil
0 ? N [?] 1 [] 0 [] 0 [] [] {} {}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [(and, 3), (to, 3), (are, 2), (the, 2), (twin,... [(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),... [(pepper, 3), (veggie, 2), (sandwich, 2), (red... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... {'the': 6, 'worst': 1, 'restaurant': 1, 'that'...

STEP 6: Try Different Sentiment Analysis Tools

VADER

In [181]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
def get_vader_score(review):
    return sid.polarity_scores(review)

all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)
In [182]:
def separate_vader_score(vader_score, key):
    return vader_score[key]

all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)
all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)
all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)
all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)

DIY SUMMARY

In [183]:
all_df[0][17]
Out[183]:
17    Halos is home. I have been here numerous times...
17    I went to Joeys and had the best lasagna on th...
Name: 0, dtype: object
In [184]:
def get_weighted_freq_dist(review, freq_dist):
    try:
        max_freq = max(freq_dist.values())
        for word in freq_dist.keys():
            freq_dist[word] = (freq_dist[word]/max_freq)
        return freq_dist
    except:
        return 'nope'

all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)
In [185]:
def get_sentence_score(review, freq_dist):
    sentence_scores = {}
    for sent in review:
        for word in nltk.word_tokenize(sent.lower()):
            if word in freq_dist.keys():
                if len(sent.split(' ')) < 30:
                    if sent not in sentence_scores.keys():
                        sentence_scores[sent] = freq_dist[word]
                    else:
                        sentence_scores[sent] += freq_dist[word]
    return sentence_scores

all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)
In [186]:
def get_summary_sentences(sentence_scores):
    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)
    return ''.join(sent[0] for sent in sorted_sentences[:5])

all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)
In [187]:
summaries = all_df['summary_sentences'].tolist()
In [188]:
summaries[3]
Out[188]:
''

Doing VADER on the Summary Section

In [189]:
all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)
In [190]:
all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)
all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)
all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)
all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)

Doing VADER on the Most Frequent Words

In [191]:
def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)

all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)
all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)
all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)
all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)
all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)

STEP 7: Test Step 6 with Machine Learning!!

Naive Bayes

In [192]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import metrics

def get_NB(small_df, labels, no_negs):
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)


    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    
    if no_negs:
        mnnb = MultinomialNB()
        mnnb.fit(x_train, y_train)
        y_pred_mn = mnnb.predict(x_test)
    
    print("Accuracy GNB:", metrics.accuracy_score(y_test, y_pred))
    if no_negs: 
        print("Accuracy MNNB:", metrics.accuracy_score(y_test, y_pred_mn))
In [193]:
# from sklearn.naive_bayes import MultinomialNB
# clf = MultinomialNB()
# clf.fit(x_train, y_train)

# print(clf.predict(x_train[2:3]))

TEST 1: Vader Scores (Original)

In [194]:
small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.5
In [195]:
small_df = all_df.filter(['v_pos', 'v_neu']) # 0.645
get_NB(small_df, all_df['PoN'], True)
Accuracy GNB: 0.35714285714285715
Accuracy MNNB: 0.6428571428571429

TEST 2: Vader Scores (from Summary)

In [196]:
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.6071428571428571
In [197]:
small_df = all_df.filter(['v_pos_sum','v_neu_sum']) # 0.59
get_NB(small_df, all_df['PoN'], True)
Accuracy GNB: 0.5714285714285714
Accuracy MNNB: 0.5357142857142857

TEST 3: Vader Scores (original) AND Vader Scores (summary)

In [198]:
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.5714285714285714
In [199]:
small_df = all_df.filter(['v_pos_sum', 'v_neu_sum', 'v_pos', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], True)
Accuracy GNB: 0.5
Accuracy MNNB: 0.6071428571428571

TEST 4: Vader Scores (50 most frequent -- filtered -- words)

In [200]:
small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.6428571428571429
In [201]:
small_df = all_df.filter(['v_pos_fd', 'v_neu_fd']) # 0.598
get_NB(small_df, all_df['PoN'], True)
Accuracy GNB: 0.5714285714285714
Accuracy MNNB: 0.6071428571428571

TEST 5: All compound Vader Scores

In [202]:
small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.6071428571428571
In [203]:
small_df = all_df.filter(['v_pos_fd','v_pos_sum', 'v_pos']) # 0.615
get_NB(small_df, all_df['PoN'], True)
Accuracy GNB: 0.6428571428571429
Accuracy MNNB: 0.42857142857142855

TEST 6: ALL THE NUMBERS!!

In [204]:
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.6071428571428571

TEST 7: Test UNFILTERED most frequent words

In [205]:
def get_freq_words(freq_dist):
    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)
    return ' '.join(word[0] for word in sorted_words[:50])

all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)

all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)

all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)
all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)
all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)
all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)
In [206]:
small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', 
                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', 
                          'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',
                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.6071428571428571
In [207]:
small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603
get_NB(small_df, all_df['PoN'], False)
Accuracy GNB: 0.5357142857142857
In [208]:
summaries_pos = all_df[all_df['PoN'] == 'P']
summaries_neg = all_df[all_df['PoN'] == 'N']
In [209]:
summaries_pos_list = summaries_pos['summary_sentences'].tolist()
summaries_neg_list = summaries_neg['summary_sentences'].tolist()
In [210]:
summaries_pos_list[:1]
Out[210]:
['Gannon’s Isle Ice Cream served the best ice cream and you better believe it!A weird combination but the smooth sweet chocolate combined with the sharp taste of raspberry was devine!The ice cream is delicious the best I had.The place is ideally situated and it is easy to get too.There were so many varieties that I had trouble choosing it.']
In [211]:
summaries_neg_list[:1]
Out[211]:
['']
In [212]:
### VERSION 1
#     all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
#     unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
#     sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#     training_set = sentim_analyzer.apply_features(training_docs)
#     test_set = sentim_analyzer.apply_features(testing_docs)
sentim_analyzer = SentimentAnalyzer()

def get_nltk_negs(tokens):
    all_words_neg = sentim_analyzer.all_words([mark_negation(tokens)])
    return all_words_neg

def get_unigram_feats(neg_tokens):
    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)
    return unigram_feats
    
all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)
all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
In [213]:
### VERSION 2
#     all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])
#     unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)
#     sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#     training_set = sentim_analyzer.apply_features(training_docs)
#     test_set = sentim_analyzer.apply_features(testing_docs)
sentim_analyzer = SentimentAnalyzer()

def get_nltk_data(tokens):
#     print(tokens)
    neg_tokens = sentim_analyzer.all_words([mark_negation(tokens)])
    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)
    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)
#     print(sentim_analyzer.apply_features(tokens))
    return sentim_analyzer.apply_features(tokens)


# def get_unigram_feats(neg_tokens):
    
#     return unigram_feats
nltk_df = pd.DataFrame()
nltk_df['nltk_data'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)

# all_df['nltk']
# all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)
# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)
In [214]:
# all_df['nltk_all'] = 0
In [215]:
nltk_df
Out[215]:
nltk_data
0 ()
1 ({'contains(was)': False, 'contains(i)': True,...
2 ({'contains(was)': False, 'contains(i)': False...
3 ()
4 ({'contains(was)': False, 'contains(i)': True,...
... ...
41 ({'contains(was)': False, 'contains(i)': True,...
42 ({'contains(was)': False, 'contains(i)': False...
43 ({'contains(was)': False, 'contains(i)': True,...
44 ({'contains(was)': False, 'contains(i)': False...
45 ({'contains(was)': False, 'contains(i)': True,...

92 rows × 1 columns

In [216]:
all_df['nltk_negs']
Out[216]:
0                                                    []
1     [twin, trees, cicero, ny, huge, salad, bar, an...
2     [the, worst, restaurant, that, i, have, ever, ...
3                                                    []
4     [i, have, been, to, a, asian, restaurant, in, ...
                            ...                        
41    [mikes, pizza, high, point, ny, service, was, ...
42    [after, i, went, shopping, with, some, of, my,...
43    [i, entered, the, restaurant, and, a, waitress...
44    [carlos, plate, shack, was, the, worst, dining...
45    [olive, oil, garden, was, very, disappointing,...
Name: nltk_negs, Length: 92, dtype: object
In [217]:
from nltk.tokenize import casual_tokenize
from collections import Counter
all_df['bow_nosw'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)
In [218]:
all_df[:3]
Out[218]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw topwords_unfil topwords_fil ... v_pos_fd v_freq_words_unfil vader_fd_all_unfil v_compound_fd_uf v_neg_fd_uf v_neu_fd_uf v_pos_fd_uf nltk_negs unigram_feats bow_nosw
0 ? N [?] 1 [] 0 [] 0 [] [] ... 0.000 {'neg': 0.0, 'neu': 0.0, 'pos': 0.0, 'compound... 0.0000 0.000 0.000 0.000 [] [] {'?': 1}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [(and, 3), (to, 3), (are, 2), (the, 2), (twin,... [(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ... ... 0.212 and to are the twin trees cicero ny huge salad... {'neg': 0.0, 'neu': 0.842, 'pos': 0.158, 'comp... 0.7951 0.000 0.842 0.158 [twin, trees, cicero, ny, huge, salad, bar, an... [and, to, are, the, twin, trees, cicero, ny, h... {'Twin': 1, 'Trees': 1, 'Cicero': 1, 'NY': 1, ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),... [(pepper, 3), (veggie, 2), (sandwich, 2), (red... ... 0.059 the i a was and to pepper my veggie sandwich r... {'neg': 0.081, 'neu': 0.882, 'pos': 0.036, 'co... -0.3182 0.081 0.882 0.036 [the, worst, restaurant, that, i, have, ever, ... [the, a, i, was_NEG, i_NEG, to_NEG, and, veggi... {'The': 1, 'worst': 1, 'restaurant': 1, 'that'...

3 rows × 40 columns

In [ ]:
 
In [ ]: