HW3 JOKER EXTREMES

STEP 1: Import Data

In [1]:
import os

def get_data_from_files(path):
    # Read every file in the directory and return the contents as a list of strings
    results = []
    for file in os.listdir(path):
        with open(os.path.join(path, file)) as f:
            results.append(f.read())
    return results

neg = get_data_from_files('../NEG_JK_E/')
pos = get_data_from_files('../POS_JK_E/')
In [11]:
print('Neg Reviews:', len(neg))
print('Pos Reviews:', len(pos))
Neg Reviews: 48
Pos Reviews: 50
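
As a quick sanity check (a minimal sketch using the variables above), peek at the start of one raw review to confirm the files were read correctly:

# Print the first 200 characters of the first negative review
print(neg[0][:200])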

STEP 2: Turn into DF & Label it

In [12]:
import pandas as pd

# One review per row; the raw text lands in column 0
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

Add labels

In [14]:
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
In [15]:
all_df = pd.concat([neg_df, pos_df])  # DataFrame.append was removed in pandas 2.0
In [16]:
all_df[:5]
Out[16]:
0 PoN
0 Everyone praised an overrated movie.\nOverrat... N
1 What idiotic FIlm\nI can say that Phoenix is ... N
2 Terrible\nThe only thing good about this movi... N
3 Watch Taxi Driver instead\nThis is a poor att... N
4 I learned one thing.\nIt borrows a lot of ele... N
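
Before modeling, it is worth confirming the class balance on the combined frame; a minimal check:

# Count positive vs. negative labels (should match the 50/48 split from Step 1)
print(all_df['PoN'].value_counts())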

STEP 3: Tokenize and Clean!

In [17]:
from nltk.tokenize import word_tokenize  # requires the 'punkt' models: nltk.download('punkt')
In [18]:
def get_tokens(sentence):
    # Tokenize, lowercase, and keep only purely alphabetic tokens
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)  # column 0 holds the raw text
all_df['num_tokens'] = all_df['tokens'].apply(len)
In [20]:
all_df[:5]
Out[20]:
0 PoN tokens num_tokens
0 Everyone praised an overrated movie.\nOverrat... N [everyone, praised, an, overrated, movie, over... 26
1 What idiotic FIlm\nI can say that Phoenix is ... N [what, idiotic, film, i, can, say, that, phoen... 66
2 Terrible\nThe only thing good about this movi... N [terrible, the, only, thing, good, about, this... 124
3 Watch Taxi Driver instead\nThis is a poor att... N [watch, taxi, driver, instead, this, is, a, po... 123
4 I learned one thing.\nIt borrows a lot of ele... N [i, learned, one, thing, it, borrows, a, lot, ... 70
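
Note that get_tokens lowercases and drops non-alphabetic tokens but keeps stopwords. If you also wanted to filter stopwords, a sketch assuming NLTK's English stopword list (requires nltk.download('stopwords'); get_tokens_no_stop is a hypothetical variant, not part of the assignment):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def get_tokens_no_stop(sentence):
    # Same cleaning as get_tokens above, plus stopword filtering
    tokens = word_tokenize(sentence)
    return [w.lower() for w in tokens if w.isalpha() and w.lower() not in stop_words]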

STEP 4: Create Bag of Words

In [23]:
from collections import Counter

# Build a per-review term-frequency dictionary from the cleaned tokens.
# An alternative is to count casual_tokenize(x[0]) on the raw text instead.
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
In [24]:
all_df[:5]
Out[24]:
0 PoN tokens num_tokens bow
0 Everyone praised an overrated movie.\nOverrat... N [everyone, praised, an, overrated, movie, over... 26 {'everyone': 1, 'praised': 1, 'an': 1, 'overra...
1 What idiotic FIlm\nI can say that Phoenix is ... N [what, idiotic, film, i, can, say, that, phoen... 66 {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...
2 Terrible\nThe only thing good about this movi... N [terrible, the, only, thing, good, about, this... 124 {'terrible': 3, 'the': 5, 'only': 1, 'thing': ...
3 Watch Taxi Driver instead\nThis is a poor att... N [watch, taxi, driver, instead, this, is, a, po... 123 {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'...
4 I learned one thing.\nIt borrows a lot of ele... N [i, learned, one, thing, it, borrows, a, lot, ... 70 {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '...
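
One way to sanity-check the bags of words is to merge the per-review Counters and look at the most frequent tokens overall (a minimal sketch; Counter objects support addition):

from collections import Counter

# Merge all per-review counts into one corpus-level Counter
total_counts = sum(all_df['bow'], Counter())
print(total_counts.most_common(10))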

STEP 5: Vectorize -- Create a Frequency Distribution Matrix

In [25]:
# Expand the Counter dicts into a documents-by-vocabulary count matrix;
# words absent from a review come out as NaN, so fill those with 0
freq_df = pd.DataFrame(all_df['bow'].tolist())
freq_df = freq_df.fillna(0).astype(int)
freq_df[:5]
Out[25]:
everyone praised an overrated movie of all time the reviews ... easy answers questions raises albeit reinvention source material alike disturbed
0 1 1 1 2 2 1 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 2 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 1 4 2 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 3 5 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 0 1 0 1 2 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 2648 columns
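
For reference, scikit-learn's CountVectorizer builds an equivalent document-term matrix directly from raw text (a sketch; its default tokenizer differs from the NLTK cleaning above, e.g. it drops single-character tokens, so the vocabulary will not match exactly):

from sklearn.feature_extraction.text import CountVectorizer

# Column 0 of all_df holds the raw review text
vec = CountVectorizer(lowercase=True)
X = vec.fit_transform(all_df[0])
print(X.shape)  # (n_documents, n_vocabulary)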

STEP 6: Normalize

With TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight raw counts by inverse document frequency and L2-normalize each row
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
data = tfidf.fit_transform(freq_df.values)
tfidf_reduced = pd.DataFrame(data.toarray())
In [27]:
tfidf_reduced[:5]
Out[27]:
0 1 2 3 4 5 6 7 8 9 ... 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647
0 0.200322 0.340128 0.127248 0.553100 0.190896 0.083310 0.130553 0.162156 0.073724 0.221842 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.000000 0.000000 0.000000 0.000000 0.112320 0.000000 0.000000 0.000000 0.086756 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.000000 0.000000 0.000000 0.106806 0.147451 0.064349 0.000000 0.000000 0.142363 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.000000 0.000000 0.000000 0.000000 0.109289 0.158984 0.000000 0.000000 0.253245 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.106888 0.000000 0.067897 0.000000 0.050929 0.088905 0.000000 0.000000 0.039338 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 2648 columns
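
To see what the transformer learned, the per-term IDF weights can be mapped back to the vocabulary (a minimal sketch; with smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1):

# Pair each learned IDF weight with its term (the columns of freq_df)
idf_weights = pd.Series(tfidf.idf_, index=freq_df.columns)
print(idf_weights.sort_values().head(10))  # the most common terms get the lowest weights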

STEP 7: Test

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

def get_NB(small_df, labels):
    # Hold out 30% of the documents for testing
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state=109)

    # Fit Gaussian Naive Bayes on the training split and score on the held-out set
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [32]:
get_NB(tfidf_reduced, all_df['PoN'])
Accuracy: 0.7666666666666667
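
With only 98 reviews, a single 70/30 split is a noisy estimate; cross-validation gives a more stable check (a sketch reusing the same features and labels):

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validation of Gaussian Naive Bayes on the TF-IDF matrix
scores = cross_val_score(GaussianNB(), tfidf_reduced.values, all_df['PoN'], cv=5)
print("Mean CV accuracy:", scores.mean())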