HW4 [Deception]

STEP 1: GET THAT DATA

In [172]:
import os

def get_data(file, path):
    # Read the full text of a single review file
    with open(path + file) as f:
        return f.read()

def get_data_from_files(path):
    # Read every file in the directory into a list of raw strings
    return [get_data(file, path) for file in os.listdir(path)]

# pos = get_data_from_files('../pos_cornell/')
# neg = get_data_from_files('../neg_cornell/')
pos = get_data_from_files('../hw4_lie_false/')  # truthful reviews -> labeled 'P'
neg = get_data_from_files('../hw4_lie_true/')   # deceptive reviews -> labeled 'N'
In [173]:
import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
all_df = pd.concat([neg_df, pos_df])  # DataFrame.append is deprecated in pandas
all_df.reset_index(drop=True,inplace=True)
all_df
Out[173]:
0 PoN
0 ? N
1 Twin Trees Cicero NY HUGE salad bar and high q... N
2 The worst restaurant that I have ever eaten in... N
3 ? N
4 I have been to a Asian restaurant in New York ... N
... ... ...
87 Mikes Pizza High Point NY Service was very slo... P
88 After I went shopping with some of my friend w... P
89 I entered the restaurant and a waitress came b... P
90 Carlos Plate Shack was the worst dining experi... P
91 Olive Oil Garden was very disappointing. I exp... P

92 rows × 2 columns
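Aside: a few files contain only a "?" placeholder (rows 0 and 3 above). They survive the pipeline as empty token lists and later become NaN rows during normalization. A minimal cleanup sketch, not applied here (all later cells still show 92 rows); clean_df is a hypothetical name:

# Hypothetical cleanup (not applied in this notebook): drop '?' placeholder rows
clean_df = all_df[all_df[0].str.strip() != '?'].reset_index(drop=True)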

STEP 2: TOKENIZE

In [174]:
import nltk  # needed for nltk.pos_tag below
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

-- 2a by sentence

In [175]:
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

-- 2b by word

In [176]:
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)
In [177]:
all_df[:3]
Out[177]:
0 PoN sentences num_sentences tokens num_tokens
0 ? N [?] 1 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105

STEP 3: EXPERIMENT

Experiment with stopword removal, stemming, lemmatization, etc.

-- 3a remove english stopwords

In [178]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    # Keep only tokens that are not English stopwords
    return [word for word in sentence if word not in stop_words]
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)
In [179]:
all_df[:3]
Out[179]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw
0 ? N [?] 1 [] 0 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49

-- 3b get stems for both tokens and no_sw

In [180]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()  # build once instead of on every call
def get_stems(sentence):
    # Reduce each token to its Porter stem
    return [ps.stem(w) for w in sentence]

all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)
all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)
In [181]:
all_df[:3]
Out[181]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw
0 ? N [?] 1 [] 0 [] 0 [] []
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla...

-- 3c get lemmas for both tokens and no_sw

In [182]:
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()  # build once instead of on every call
def get_lemmas(sentence):
    # Map each token to its WordNet lemma (noun by default)
    return [lem.lemmatize(w) for w in sentence]

all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)
all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)
In [183]:
all_df[:3]
Out[183]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] []
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ...
In [184]:
all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)
all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)
In [185]:
def get_pos_dict(pos_tuples):
    # Tally how many times each POS tag appears in one document
    pos_dict = {}
    for word, tag in pos_tuples:
        pos_dict[tag] = pos_dict.get(tag, 0) + 1
    return pos_dict

all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)
all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)
all_df[:3]
Out[185]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] [] [] [] {} {}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...
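Aside: get_pos_dict is a hand-rolled tally; collections.Counter over the tags produces the same tag -> count mapping in one line. A minimal equivalent sketch (get_pos_dict_alt is a hypothetical name):

from collections import Counter

def get_pos_dict_alt(pos_tuples):
    # Counter is a dict subclass, so downstream code expecting a dict still works
    return Counter(tag for _, tag in pos_tuples)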
In [198]:
from collections import Counter

def get_bow_from_tokens(df, column):
    # Frequency distribution over a whole string column; defined but not used below
    all_column_data = ' '.join(df[column].tolist())
    return Counter(all_column_data.split())

# Per-document bag of words: token -> count
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)
all_df[:3]
Out[198]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw bow bow_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] [] [] [] {} {} {} {}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...

STEP 4: TEST EXPERIMENTS!!

In [203]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

def get_NB(small_df, labels):
    # 70/30 split with a fixed seed, then Gaussian Naive Bayes
    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=0.3, random_state=109)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [204]:
new_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
get_NB(new_df, new_df.index)
new_df[:5]
Accuracy: 0.5714285714285714
Out[204]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 0 0 0 0 0
N 29 1 1 7 5 1 14 8 4 4 ... 1 0 0 0 0 0 0 0 0 0
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 28 columns

In [219]:

def normalize_df(df):
    # Turn raw POS counts into per-document proportions.
    # NB: empty documents have total 0, so the division leaves
    # their rows as NaN (see rows 0 and 3 in the output below).
    names = df.columns
    df["total"] = df.sum(axis=1)
    for name in names:
        df[name] = df[name] / df["total"]
    df.drop("total", axis=1, inplace=True)
    return df
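Because the division is per-row, a vectorized sketch that also guards the zero-total rows may be preferable (normalize_df_safe is a hypothetical alternative, assuming all-zero rows are the desired fill for empty documents):

def normalize_df_safe(df):
    totals = df.sum(axis=1)
    # Zero-total rows divide by 1 and so stay all-zero instead of NaN
    return df.div(totals.replace(0, 1), axis=0)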
In [222]:
norm_df = normalize_df(new_df)

# get_NB(norm_df.fillna(0), norm_df.index)  # not run here; NaN rows must be filled first
norm_df
Out[222]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 0.207547 0.056604 0.056604 0.169811 0.056604 0.037736 0.075472 0.075472 0.075472 0.056604 ... 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
N 0.276190 0.009524 0.009524 0.066667 0.047619 0.009524 0.133333 0.076190 0.038095 0.038095 ... 0.009524 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 0.288889 0.044444 0.044444 0.111111 0.022222 0.044444 0.111111 0.000000 0.000000 0.022222 ... 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
P 0.162791 0.046512 0.023256 0.139535 0.046512 0.000000 0.046512 0.069767 0.093023 0.046512 ... 0.023256 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.069767 0.000000
P 0.208333 0.041667 0.000000 0.041667 0.000000 0.000000 0.083333 0.000000 0.041667 0.041667 ... 0.041667 0.0 0.0 0.0 0.000000 0.0 0.041667 0.0 0.000000 0.000000
P 0.191919 0.010101 0.000000 0.070707 0.070707 0.010101 0.141414 0.101010 0.070707 0.040404 ... 0.010101 0.0 0.0 0.0 0.000000 0.0 0.010101 0.0 0.010101 0.020202
P 0.206452 0.045161 0.019355 0.090323 0.045161 0.000000 0.096774 0.070968 0.045161 0.045161 ... 0.012903 0.0 0.0 0.0 0.025806 0.0 0.000000 0.0 0.000000 0.000000
P 0.232558 0.023256 0.046512 0.139535 0.046512 0.000000 0.093023 0.069767 0.046512 0.046512 ... 0.000000 0.0 0.0 0.0 0.023256 0.0 0.000000 0.0 0.000000 0.000000

92 rows × 28 columns

In [197]:
new_df = pd.DataFrame(all_df['bow_no_sw'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)
Accuracy: 0.5
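For comparison, sklearn's DictVectorizer can build the same document-term matrix straight from the bow_no_sw Counters. A minimal sketch, not run in the original (assumes a recent sklearn with get_feature_names_out):

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(all_df['bow_no_sw'])  # each Counter becomes one row
bow_df = pd.DataFrame(X, index=all_df['PoN'], columns=vec.get_feature_names_out())
get_NB(bow_df, bow_df.index)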