HW3 JOKER EXTREMES

STEP 1: Import Data

In [1]:
import os

def get_data_from_files(path):
    # Read every file in the directory and return the contents as a list of strings
    results = []
    for file in os.listdir(path):
        with open(os.path.join(path, file)) as f:
            results.append(f.read())
    return results

neg = get_data_from_files('../NEG_JK_E/')
pos = get_data_from_files('../POS_JK_E/')
In [11]:
print('Neg Reviews:', len(neg))
print('Pos Reviews:', len(pos))
Neg Reviews: 48
Pos Reviews: 50
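
As a quick sanity check (a minimal sketch using the variables above), peek at the start of one raw review to confirm the files were read correctly:

# Print the first 200 characters of the first negative review
print(neg[0][:200])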

STEP 2: Turn into DF & Label it

In [12]:
import pandas as pd

# One review per row; the raw text lands in column 0
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)

Add labels

In [14]:
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
In [15]:
all_df = pd.concat([neg_df, pos_df])  # DataFrame.append was removed in pandas 2.0
In [16]:
all_df[:5]
Out[16]:
0 PoN
0 Everyone praised an overrated movie.\nOverrat... N
1 What idiotic FIlm\nI can say that Phoenix is ... N
2 Terrible\nThe only thing good about this movi... N
3 Watch Taxi Driver instead\nThis is a poor att... N
4 I learned one thing.\nIt borrows a lot of ele... N
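
Before modeling, it is worth confirming the class balance on the combined frame; a minimal check:

# Count positive vs. negative labels (should match the 50/48 split from Step 1)
print(all_df['PoN'].value_counts())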

STEP 3: Tokenize and Clean!

In [17]:
from nltk.tokenize import word_tokenize  # requires the 'punkt' models: nltk.download('punkt')
In [18]:
def get_tokens(sentence):
    # Tokenize, lowercase, and keep only purely alphabetic tokens
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)  # column 0 holds the raw text
all_df['num_tokens'] = all_df['tokens'].apply(len)
In [20]:
all_df[:5]
Out[20]:
0 PoN tokens num_tokens
0 Everyone praised an overrated movie.\nOverrat... N [everyone, praised, an, overrated, movie, over... 26
1 What idiotic FIlm\nI can say that Phoenix is ... N [what, idiotic, film, i, can, say, that, phoen... 66
2 Terrible\nThe only thing good about this movi... N [terrible, the, only, thing, good, about, this... 124
3 Watch Taxi Driver instead\nThis is a poor att... N [watch, taxi, driver, instead, this, is, a, po... 123
4 I learned one thing.\nIt borrows a lot of ele... N [i, learned, one, thing, it, borrows, a, lot, ... 70
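
Note that get_tokens lowercases and drops non-alphabetic tokens but keeps stopwords. If you also wanted to filter stopwords, a sketch assuming NLTK's English stopword list (requires nltk.download('stopwords'); get_tokens_no_stop is a hypothetical variant, not part of the assignment):

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words('english'))

def get_tokens_no_stop(sentence):
    # Same cleaning as get_tokens above, plus stopword filtering
    tokens = word_tokenize(sentence)
    return [w.lower() for w in tokens if w.isalpha() and w.lower() not in stop_words]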

STEP 4: Create Bag of Words

In [23]:
from collections import Counter

# Build a per-review term-frequency dictionary from the cleaned tokens.
# An alternative is to count casual_tokenize(x[0]) on the raw text instead.
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
In [24]:
all_df[:5]
Out[24]:
0 PoN tokens num_tokens bow
0 Everyone praised an overrated movie.\nOverrat... N [everyone, praised, an, overrated, movie, over... 26 {'everyone': 1, 'praised': 1, 'an': 1, 'overra...
1 What idiotic FIlm\nI can say that Phoenix is ... N [what, idiotic, film, i, can, say, that, phoen... 66 {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...
2 Terrible\nThe only thing good about this movi... N [terrible, the, only, thing, good, about, this... 124 {'terrible': 3, 'the': 5, 'only': 1, 'thing': ...
3 Watch Taxi Driver instead\nThis is a poor att... N [watch, taxi, driver, instead, this, is, a, po... 123 {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'...
4 I learned one thing.\nIt borrows a lot of ele... N [i, learned, one, thing, it, borrows, a, lot, ... 70 {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '...
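
One way to sanity-check the bags of words is to merge the per-review Counters and look at the most frequent tokens overall (a minimal sketch; Counter objects support addition):

from collections import Counter

# Merge all per-review counts into one corpus-level Counter
total_counts = sum(all_df['bow'], Counter())
print(total_counts.most_common(10))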

STEP 5: Vectorize -- Create a Frequency Distribution Matrix

In [25]:
# Expand the Counter dicts into a documents-by-vocabulary count matrix;
# words absent from a review come out as NaN, so fill those with 0
freq_df = pd.DataFrame(all_df['bow'].tolist())
freq_df = freq_df.fillna(0).astype(int)
freq_df[:5]
Out[25]:
everyone praised an overrated movie of all time the reviews ... easy answers questions raises albeit reinvention source material alike disturbed
0 1 1 1 2 2 1 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
1 0 0 0 0 2 0 0 0 2 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0 0 1 4 2 0 0 5 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0 0 0 3 5 0 0 9 0 ... 0 0 0 0 0 0 0 0 0 0
4 1 0 1 0 1 2 0 0 1 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 2648 columns
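
For reference, scikit-learn's CountVectorizer builds an equivalent document-term matrix directly from raw text (a sketch; its default tokenizer differs from the NLTK cleaning above, e.g. it drops single-character tokens, so the vocabulary will not match exactly):

from sklearn.feature_extraction.text import CountVectorizer

# Column 0 of all_df holds the raw review text
vec = CountVectorizer(lowercase=True)
X = vec.fit_transform(all_df[0])
print(X.shape)  # (n_documents, n_vocabulary)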

STEP 6: Normalize

With TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight raw counts by inverse document frequency and L2-normalize each row
tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)
data = tfidf.fit_transform(freq_df.values)
tfidf_reduced = pd.DataFrame(data.toarray())
In [27]:
tfidf_reduced[:5]
Out[27]:
0 1 2 3 4 5 6 7 8 9 ... 2638 2639 2640 2641 2642 2643 2644 2645 2646 2647
0 0.200322 0.340128 0.127248 0.553100 0.190896 0.083310 0.130553 0.162156 0.073724 0.221842 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 0.000000 0.000000 0.000000 0.000000 0.112320 0.000000 0.000000 0.000000 0.086756 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
2 0.000000 0.000000 0.000000 0.106806 0.147451 0.064349 0.000000 0.000000 0.142363 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
3 0.000000 0.000000 0.000000 0.000000 0.109289 0.158984 0.000000 0.000000 0.253245 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
4 0.106888 0.000000 0.067897 0.000000 0.050929 0.088905 0.000000 0.000000 0.039338 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 2648 columns
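
To see what the transformer learned, the per-term IDF weights can be mapped back to the vocabulary (a minimal sketch; with smooth_idf=True, scikit-learn computes idf(t) = ln((1 + n) / (1 + df(t))) + 1):

# Pair each learned IDF weight with its term (the columns of freq_df)
idf_weights = pd.Series(tfidf.idf_, index=freq_df.columns)
print(idf_weights.sort_values().head(10))  # the most common terms get the lowest weights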

STEP 7: Test

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

def get_NB(small_df, labels):
    # Hold out 30% of the documents for testing
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state=109)

    # Fit Gaussian Naive Bayes on the training split and score on the held-out set
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [32]:
get_NB(tfidf_reduced, all_df['PoN'])
Accuracy: 0.7666666666666667
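
With only 98 reviews, a single 70/30 split is a noisy estimate; cross-validation gives a more stable check (a sketch reusing the same features and labels):

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 5-fold cross-validation of Gaussian Naive Bayes on the TF-IDF matrix
scores = cross_val_score(GaussianNB(), tfidf_reduced.values, all_df['PoN'], cv=5)
print("Mean CV accuracy:", scores.mean())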