HW4 [Deception]

STEP 1: GET THAT DATA

In [172]:
import os

def get_data(file, path):
    # Read the full text of a single review file
    with open(path + file) as f:
        return f.read()

def get_data_from_files(path):
    # Read every file in the directory into a list of raw strings
    return [get_data(file, path) for file in os.listdir(path)]

# pos = get_data_from_files('../pos_cornell/')
# neg = get_data_from_files('../neg_cornell/')
pos = get_data_from_files('../hw4_lie_false/')  # truthful reviews -> labeled 'P'
neg = get_data_from_files('../hw4_lie_true/')   # deceptive reviews -> labeled 'N'
In [173]:
import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
all_df = pd.concat([neg_df, pos_df])  # DataFrame.append is deprecated in pandas
all_df.reset_index(drop=True,inplace=True)
all_df
Out[173]:
0 PoN
0 ? N
1 Twin Trees Cicero NY HUGE salad bar and high q... N
2 The worst restaurant that I have ever eaten in... N
3 ? N
4 I have been to a Asian restaurant in New York ... N
... ... ...
87 Mikes Pizza High Point NY Service was very slo... P
88 After I went shopping with some of my friend w... P
89 I entered the restaurant and a waitress came b... P
90 Carlos Plate Shack was the worst dining experi... P
91 Olive Oil Garden was very disappointing. I exp... P

92 rows × 2 columns
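Aside: a few files contain only a "?" placeholder (rows 0 and 3 above). They survive the pipeline as empty token lists and later become NaN rows during normalization. A minimal cleanup sketch, not applied here (all later cells still show 92 rows); clean_df is a hypothetical name:

# Hypothetical cleanup (not applied in this notebook): drop '?' placeholder rows
clean_df = all_df[all_df[0].str.strip() != '?'].reset_index(drop=True)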

STEP 2: TOKENIZE

In [174]:
import nltk  # needed for nltk.pos_tag below
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

-- 2a by sentence

In [175]:
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

-- 2b by word

In [176]:
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)
In [177]:
all_df[:3]
Out[177]:
0 PoN sentences num_sentences tokens num_tokens
0 ? N [?] 1 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105

STEP 3: EXPERIMENT

Experiment with stopword removal, stemming, lemmatization, etc.

-- 3a remove english stopwords

In [178]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    # Keep only tokens that are not English stopwords
    return [word for word in sentence if word not in stop_words]
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)
In [179]:
all_df[:3]
Out[179]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw
0 ? N [?] 1 [] 0 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49

-- 3b get stems for both tokens and no_sw

In [180]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()  # build once instead of on every call
def get_stems(sentence):
    # Reduce each token to its Porter stem
    return [ps.stem(w) for w in sentence]

all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)
all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)
In [181]:
all_df[:3]
Out[181]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw
0 ? N [?] 1 [] 0 [] 0 [] []
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla...

-- 3c get lemmas for both tokens and no_sw

In [182]:
from nltk.stem.wordnet import WordNetLemmatizer

lem = WordNetLemmatizer()  # build once instead of on every call
def get_lemmas(sentence):
    # Map each token to its WordNet lemma (noun by default)
    return [lem.lemmatize(w) for w in sentence]

all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)
all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)
In [183]:
all_df[:3]
Out[183]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] []
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ...
In [184]:
all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)
all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)
In [185]:
def get_pos_dict(pos_tuples):
    # Tally how many times each POS tag appears in one document
    pos_dict = {}
    for word, tag in pos_tuples:
        pos_dict[tag] = pos_dict.get(tag, 0) + 1
    return pos_dict

all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)
all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)
all_df[:3]
Out[185]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] [] [] [] {} {}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...
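Aside: get_pos_dict is a hand-rolled tally; collections.Counter over the tags produces the same tag -> count mapping in one line. A minimal equivalent sketch (get_pos_dict_alt is a hypothetical name):

from collections import Counter

def get_pos_dict_alt(pos_tuples):
    # Counter is a dict subclass, so downstream code expecting a dict still works
    return Counter(tag for _, tag in pos_tuples)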
In [198]:
from collections import Counter

def get_bow_from_tokens(df, column):
    # Frequency distribution over a whole string column; defined but not used below
    all_column_data = ' '.join(df[column].tolist())
    return Counter(all_column_data.split())

# Per-document bag of words: token -> count
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)
all_df[:3]
Out[198]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw bow bow_no_sw
0 ? N [?] 1 [] 0 [] 0 [] [] [] [] [] [] {} {} {} {}
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...

STEP 4: TEST EXPERIMENTS!!

In [203]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

def get_NB(small_df, labels):
    # 70/30 split with a fixed seed, then Gaussian Naive Bayes
    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=0.3, random_state=109)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
In [204]:
new_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
get_NB(new_df, new_df.index)
new_df[:5]
Accuracy: 0.5714285714285714
Out[204]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 0 0 0 0 0
N 29 1 1 7 5 1 14 8 4 4 ... 1 0 0 0 0 0 0 0 0 0
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 28 columns

In [219]:

def normalize_df(df):
    # Turn raw POS counts into per-document proportions.
    # NB: empty documents have total 0, so the division leaves
    # their rows as NaN (see rows 0 and 3 in the output below).
    names = df.columns
    df["total"] = df.sum(axis=1)
    for name in names:
        df[name] = df[name] / df["total"]
    df.drop("total", axis=1, inplace=True)
    return df
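Because the division is per-row, a vectorized sketch that also guards the zero-total rows may be preferable (normalize_df_safe is a hypothetical alternative, assuming all-zero rows are the desired fill for empty documents):

def normalize_df_safe(df):
    totals = df.sum(axis=1)
    # Zero-total rows divide by 1 and so stay all-zero instead of NaN
    return df.div(totals.replace(0, 1), axis=0)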
In [222]:
norm_df = normalize_df(new_df)

# get_NB(norm_df.fillna(0), norm_df.index)  # not run here; NaN rows must be filled first
norm_df
Out[222]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 0.207547 0.056604 0.056604 0.169811 0.056604 0.037736 0.075472 0.075472 0.075472 0.056604 ... 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
N 0.276190 0.009524 0.009524 0.066667 0.047619 0.009524 0.133333 0.076190 0.038095 0.038095 ... 0.009524 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
N NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 0.288889 0.044444 0.044444 0.111111 0.022222 0.044444 0.111111 0.000000 0.000000 0.022222 ... 0.000000 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.000000 0.000000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
P 0.162791 0.046512 0.023256 0.139535 0.046512 0.000000 0.046512 0.069767 0.093023 0.046512 ... 0.023256 0.0 0.0 0.0 0.000000 0.0 0.000000 0.0 0.069767 0.000000
P 0.208333 0.041667 0.000000 0.041667 0.000000 0.000000 0.083333 0.000000 0.041667 0.041667 ... 0.041667 0.0 0.0 0.0 0.000000 0.0 0.041667 0.0 0.000000 0.000000
P 0.191919 0.010101 0.000000 0.070707 0.070707 0.010101 0.141414 0.101010 0.070707 0.040404 ... 0.010101 0.0 0.0 0.0 0.000000 0.0 0.010101 0.0 0.010101 0.020202
P 0.206452 0.045161 0.019355 0.090323 0.045161 0.000000 0.096774 0.070968 0.045161 0.045161 ... 0.012903 0.0 0.0 0.0 0.025806 0.0 0.000000 0.0 0.000000 0.000000
P 0.232558 0.023256 0.046512 0.139535 0.046512 0.000000 0.093023 0.069767 0.046512 0.046512 ... 0.000000 0.0 0.0 0.0 0.023256 0.0 0.000000 0.0 0.000000 0.000000

92 rows × 28 columns

In [197]:
new_df = pd.DataFrame(all_df['bow_no_sw'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
get_NB(new_df, new_df.index)
Accuracy: 0.5
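For comparison, sklearn's DictVectorizer can build the same document-term matrix straight from the bow_no_sw Counters. A minimal sketch, not run in the original (assumes a recent sklearn with get_feature_names_out):

from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer(sparse=False)
X = vec.fit_transform(all_df['bow_no_sw'])  # each Counter becomes one row
bow_df = pd.DataFrame(X, index=all_df['PoN'], columns=vec.get_feature_names_out())
get_NB(bow_df, bow_df.index)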