HW4 [Deception]

STEP 1: GET THAT DATA

In [2]:
import os
def get_data(file, path):
    f=open(path+file)
    data = f.read()
    f.close()
    return data
    
def get_data_from_files(path):
    results = [get_data(file, path) for file in os.listdir(path)]
    return results

# pos = get_data_from_files('../pos_cornell//')
# neg = get_data_from_files('../neg_cornell/')

# pos = get_data_from_files('../hw4_lie_false/')
# neg = get_data_from_files('../hw4_lie_true/')

pos = get_data_from_files('../hw4_lie_false/')
neg = get_data_from_files('../hw4_lie_true/')
In [3]:
import pandas as pd
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
all_df = neg_df.append(pos_df)
all_df.reset_index(drop=True,inplace=True)
all_df[:5]
Out[3]:
0 PoN
0 ? N
1 Twin Trees Cicero NY HUGE salad bar and high q... N
2 The worst restaurant that I have ever eaten in... N
3 ? N
4 I have been to a Asian restaurant in New York ... N

STEP 2: TOKENIZE

In [4]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *

-- 2a by sentence

In [5]:
def get_sentence_tokens(review):
    return sent_tokenize(review)
    
all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)
all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)

-- 2b by word

In [6]:
def get_tokens(sentence):
    tokens = word_tokenize(sentence)
    clean_tokens = [word.lower() for word in tokens if word.isalpha()]
    return clean_tokens

all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)
all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)
In [7]:
all_df[:3]
Out[7]:
0 PoN sentences num_sentences tokens num_tokens
0 ? N [?] 1 [] 0
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105

-- 2c Remove if tokens < 1

In [8]:
all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)
all_df[:3]
Out[8]:
0 PoN sentences num_sentences tokens num_tokens
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45

STEP 3: EXPERIMENT

Experiment with: stopwords, stemming, lemming etc.

In [ ]:
 

-- 3a remove english stopwords

In [9]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    filtered_text = []
    for word in sentence:
        if word not in stop_words:
            filtered_text.append(word)
    return filtered_text
all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)
all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)
In [10]:
all_df[:3]
Out[10]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23

-- 3b get stems for both tokens and no_sw

In [11]:
from nltk.stem import PorterStemmer
def get_stems(sentence):
    ps = PorterStemmer()
    return [ps.stem(w) for w in sentence]
    
all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)
all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)
In [12]:
all_df[:3]
Out[12]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte...

-- 3c get lemmas for both tokens and no_sw

In [13]:
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemmas(sentence):
    lem = WordNetLemmatizer() 
    return [lem.lemmatize(w) for w in sentence]
    
all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)
all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)
In [14]:
all_df[:3]
Out[14]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte... [i, have, been, to, a, asian, restaurant, in, ... [asian, restaurant, new, york, city, menu, wri...
In [15]:
all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)
all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)
In [16]:
def get_pos_dict(pos_tuple):
    pos_dict = {}
    for t in pos_tuple:
        if t[1] in pos_dict.keys():
            pos_dict[t[1]] += 1
        else:
            pos_dict.update({t[1]: 1})
    return pos_dict

all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)
all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)
all_df[:3]
Out[16]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte... [i, have, been, to, a, asian, restaurant, in, ... [asian, restaurant, new, york, city, menu, wri... [(i, NNS), (have, VBP), (been, VBN), (to, TO),... [(asian, JJ), (restaurant, NN), (new, JJ), (yo... {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':...
In [17]:
# def get_bow_from_tokens(df, column):
#     all_column_data = ' '.join(df[column].tolist())
#     all_column_fd = Counter(all_column_data.split())
#     return all_column_fd

# # bow = get_bow_from_column(all_df, 'diy_cleaner')
# # bow =
from collections import Counter
all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)
all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)
all_df[:3]
Out[17]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw lemmed lemmed_no_sw pos pos_no_sw pos_dict pos_dict_no_sw bow bow_no_sw
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... [the, worst, restaurant, that, i, have, ever, ... [worst, restaurant, ever, eaten, undoubtedly, ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte... [i, have, been, to, a, asian, restaurant, in, ... [asian, restaurant, new, york, city, menu, wri... [(i, NNS), (have, VBP), (been, VBN), (to, TO),... [(asian, JJ), (restaurant, NN), (new, JJ), (yo... {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':... {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... {'asian': 1, 'restaurant': 1, 'new': 1, 'york'...
In [18]:
all_df_n = all_df[all_df['PoN'] == 'N']
all_df_p = all_df[all_df['PoN'] == 'P']

big_bow = [item for review in all_df['bow'].tolist() for item in review]
big_bow_n = [item for review in all_df_n['bow'].tolist() for item in review]
big_bow_p = [item for review in all_df_p['bow'].tolist() for item in review]

df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()
df = df.rename(columns={'index':'word', 0:'count'})

df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()
df_n = df_n.rename(columns={'index':'word', 0:'count'})

df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()
df_p = df_p.rename(columns={'index':'word', 0:'count'})
In [19]:
import seaborn as sns
import matplotlib.pyplot as plt 
def bar_plot(df, title): 
    graph = sns.barplot(y = "count", x = "word", data = df, palette = "husl")
    plt.title(title)
    plt.xlabel("Word")
    plt.ylabel("Count")
    sns.set_context("talk")
    plt.xticks(rotation = 90)
    return plt

print(bar_plot(df.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (ALL) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [20]:
print(bar_plot(df_n.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (TRUE) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [21]:
print(bar_plot(df_p.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (FALSE) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [22]:
all_df_n = all_df[all_df['PoN'] == 'N']
all_df_p = all_df[all_df['PoN'] == 'P']

big_bow = [item for review in all_df['bow_no_sw'].tolist() for item in review]
big_bow_n = [item for review in all_df_n['bow_no_sw'].tolist() for item in review]
big_bow_p = [item for review in all_df_p['bow_no_sw'].tolist() for item in review]

df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()
df = df.rename(columns={'index':'word', 0:'count'})

df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()
df_n = df_n.rename(columns={'index':'word', 0:'count'})

df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()
df_p = df_p.rename(columns={'index':'word', 0:'count'})
In [23]:
print(bar_plot(df.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (ALL) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [24]:
print(bar_plot(df_n.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (TRUE) Stopwords Removed"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [25]:
print(bar_plot(df_p.sort_values(by=["count"], ascending=False)[:20], "Top 20 Items (FALSE) Stopwords Removed"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [26]:
all_df_n = all_df[all_df['PoN'] == 'N']
all_df_p = all_df[all_df['PoN'] == 'P']

big_bow = [item for review in all_df['pos_dict'].tolist() for item in review]
big_bow_n = [item for review in all_df_n['pos_dict'].tolist() for item in review]
big_bow_p = [item for review in all_df_p['pos_dict'].tolist() for item in review]

df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()
df = df.rename(columns={'index':'word', 0:'count'})

df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()
df_n = df_n.rename(columns={'index':'word', 0:'count'})

df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()
df_p = df_p.rename(columns={'index':'word', 0:'count'})
In [27]:
print(bar_plot(df.sort_values(by=["count"], ascending=False)[:10], "Top 10 Items (ALL) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [28]:
print(bar_plot(df_n.sort_values(by=["count"], ascending=False)[:10], "Top 10 POS (TRUE) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [29]:
print(bar_plot(df_p.sort_values(by=["count"], ascending=False)[:10], "Top 10 POS (FALSE) Prior to Cleaning"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>

STEP 4: TEST EXPERIMENTS!!

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

def get_NB(small_df, labels):
    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)

    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    from sklearn import metrics
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

TEST 1: Parts of speech frequency distribution

In [31]:
pos_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])
pos_df[:3]
Out[31]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N 11.0 3.0 3.0 9.0 3.0 2.0 4.0 4.0 4.0 3.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 29.0 1.0 1.0 7.0 5.0 1.0 14.0 8.0 4.0 4.0 ... 1.0 NaN NaN NaN NaN NaN NaN NaN NaN NaN
N 13.0 2.0 2.0 5.0 1.0 2.0 5.0 NaN NaN 1.0 ... NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN

3 rows × 28 columns

In [32]:
pos_df = pos_df.fillna(0).astype(int)
pos_df[:3]
Out[32]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 0 0 0 0 0
N 29 1 1 7 5 1 14 8 4 4 ... 1 0 0 0 0 0 0 0 0 0
N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 0 0 0 0 0

3 rows × 28 columns

In [33]:
get_NB(pos_df, pos_df.index)
Accuracy: 0.5925925925925926

TEST 1b: Normalized parts of speech frequency distribution

In [34]:
pos_df_norm = pos_df.copy()
pos_df_norm = pos_df_norm.apply(lambda x: x/x.sum(), axis=1)
pos_df_norm[:3]
pos_df_norm[1:]
test = pos_df.copy()
test['total'] = test.sum(axis = 1)
test[:3]
Out[34]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... EX JJR PDT RP WP CD RBR MD RBS total
PoN
N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 0 0 0 0 53
N 29 1 1 7 5 1 14 8 4 4 ... 0 0 0 0 0 0 0 0 0 105
N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 0 0 0 0 45

3 rows × 29 columns

In [35]:
pos_df_norm[:3]
Out[35]:
NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP WP CD RBR MD RBS
PoN
N 0.207547 0.056604 0.056604 0.169811 0.056604 0.037736 0.075472 0.075472 0.075472 0.056604 ... 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N 0.276190 0.009524 0.009524 0.066667 0.047619 0.009524 0.133333 0.076190 0.038095 0.038095 ... 0.009524 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
N 0.288889 0.044444 0.044444 0.111111 0.022222 0.044444 0.111111 0.000000 0.000000 0.022222 ... 0.000000 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

3 rows × 28 columns

In [36]:
get_NB(pos_df_norm, pos_df.index)
Accuracy: 0.5925925925925926
In [37]:
# small_df
small_df = pos_df_norm.filter(['PRP', 'PRP$','NN'])
get_NB(small_df, pos_df.index)
Accuracy: 0.4444444444444444
In [38]:
pos_df_n = pos_df[pos_df.index == 'N']
pos_df_p = pos_df[pos_df.index == 'P']
print(pos_df['PRP'].sum())
print(pos_df_n['PRP'].sum())
print(pos_df_p['PRP'].sum())
print(pos_df_n['PRP'].sum()/pos_df['PRP'].sum())
print(pos_df_p['PRP'].sum()/pos_df['PRP'].sum())
337
160
177
0.47477744807121663
0.5252225519287834
In [39]:
pos_df_n = pos_df[pos_df.index == 'N']
pos_df_p = pos_df[pos_df.index == 'P']
print(pos_df['PRP$'].sum())
print(pos_df_n['PRP$'].sum())
print(pos_df_p['PRP$'].sum())
138
65
73
In [40]:
pos_df_n = pos_df_norm[pos_df_norm.index == 'N']
pos_df_p = pos_df_norm[pos_df_norm.index == 'P']
print(pos_df_norm['PRP'].sum())
print(pos_df_n['PRP'].sum())
print(pos_df_p['PRP'].sum())
4.256356712105416
2.060598739935355
2.19575797217006
In [41]:
pos_df_n = pos_df_norm[pos_df_norm.index == 'N']
pos_df_p = pos_df_norm[pos_df_norm.index == 'P']
print(pos_df_norm['PRP'].mean())
print(pos_df_n['PRP'].mean())
print(pos_df_p['PRP'].mean())
0.04729285235672684
0.04683178954398534
0.047733868960218695
In [42]:
pos_df_n = pos_df_norm[pos_df_norm.index == 'N']
pos_df_p = pos_df_norm[pos_df_norm.index == 'P']
print(pos_df_norm['PRP$'].mean())
print(pos_df_n['PRP$'].mean())
print(pos_df_p['PRP$'].mean())
0.0177106769174579
0.017530735194787515
0.0178827950869687
In [43]:
all_df['pos']
Out[43]:
1     [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...
2     [(the, DT), (worst, JJS), (restaurant, NN), (t...
4     [(i, NNS), (have, VBP), (been, VBN), (to, TO),...
5     [(the, DT), (best, JJS), (restaurant, NN), (i,...
6     [(the, DT), (restaurant, NN), (looked, VBD), (...
                            ...                        
87    [(mikes, NNS), (pizza, VBP), (high, JJ), (poin...
88    [(after, IN), (i, JJ), (went, VBD), (shopping,...
89    [(i, NN), (entered, VBD), (the, DT), (restaura...
90    [(carlos, NN), (plate, NN), (shack, NN), (was,...
91    [(olive, JJ), (oil, NN), (garden, NN), (was, V...
Name: pos, Length: 90, dtype: object
In [44]:
all_df['pos_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos']], axis=1)
all_df['pos_sent_str'] = all_df.apply(lambda x: [' '.join(x['pos_sent'])], axis=1)
all_df['pos_no_sw_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos_no_sw']], axis=1)
In [45]:
type(all_df['pos_sent_str'][1])
Out[45]:
list
In [46]:
all_df['pos_sent_bi'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(" ")[:-1], l.split(" ")[1:])], axis=1)
# bigrams = [b for l in text for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
In [47]:
# all_df['pos_sent_tri'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(" ")[:-1], l.split(" ")[1:])], axis=1)
In [48]:
all_df[:4]
Out[48]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw ... pos pos_no_sw pos_dict pos_dict_no_sw bow bow_no_sw pos_sent pos_sent_str pos_no_sw_sent pos_sent_bi
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... ... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... [NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,... [NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ... [NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS... [(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... ... [(the, DT), (worst, JJS), (restaurant, NN), (t... [(worst, RBS), (restaurant, NN), (ever, RB), (... {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... [DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R... [DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB... [RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN... [(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte... ... [(i, NNS), (have, VBP), (been, VBN), (to, TO),... [(asian, JJ), (restaurant, NN), (new, JJ), (yo... {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':... {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... [NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN... [NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ... [JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,... [(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ...
5 The best restaurant I have gone to is when I w... N [The best restaurant I have gone to is when I ... 6 [the, best, restaurant, i, have, gone, to, is,... 71 [best, restaurant, gone, went, applebee, frien... 30 [the, best, restaur, i, have, gone, to, is, wh... [best, restaur, gone, went, applebe, friend, s... ... [(the, DT), (best, JJS), (restaurant, NN), (i,... [(best, RBS), (restaurant, NN), (gone, VBN), (... {'DT': 6, 'JJS': 1, 'NN': 10, 'VBP': 2, 'VBN':... {'RBS': 1, 'NN': 7, 'VBN': 1, 'VBD': 2, 'JJ': ... {'the': 5, 'best': 1, 'restaurant': 2, 'i': 4,... {'best': 1, 'restaurant': 2, 'gone': 1, 'went'... [DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ... [DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ... [RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V... [(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V...

4 rows × 22 columns

In [49]:
test = all_df['pos_sent_str'][1]
test
Out[49]:
['NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN']
In [50]:
text = ["this is a sentence", "so is this one"]
test2 = ["NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN", "PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN"]
test1 = ['NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN']
bigrams = [b for l in test1 for b in zip(l.split(" ")[:-1], l.split(" ")[1:])]
print(bigrams)
[('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN'), ('NN', 'NN'), ('NN', 'CC'), ('CC', 'JJ'), ('JJ', 'NN'), ('NN', 'JJ'), ('JJ', 'VBZ'), ('VBZ', 'DT'), ('DT', 'NN'), ('NN', 'VBZ'), ('VBZ', 'RB'), ('RB', 'JJ'), ('JJ', 'RB'), ('RB', 'CC'), ('CC', 'JJ'), ('JJ', 'VB'), ('VB', 'JJ'), ('JJ', 'TO'), ('TO', 'VB'), ('VB', 'DT'), ('DT', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'IN'), ('IN', 'RB'), ('RB', 'RB'), ('RB', 'IN'), ('IN', 'PRP'), ('PRP', 'VBP'), ('VBP', 'VBN'), ('VBN', 'IN'), ('IN', 'DT'), ('DT', 'JJ'), ('JJ', 'NNS'), ('NNS', 'DT'), ('DT', 'NN'), ('NN', 'VBP'), ('VBP', 'JJ'), ('JJ', 'IN'), ('IN', 'PRP$'), ('PRP$', 'NNS'), ('NNS', 'CC'), ('CC', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'NN')]
In [51]:
# all_bigrams = [bigram for bigram in all_df.pos_sent_bi.tolist()]
# flat_list = [item for sublist in l for item in sublist]
all_df_n = all_df[all_df['PoN'] == 'N']
all_df_p = all_df[all_df['PoN'] == 'P']
all_bigrams = [bigram for sublist in all_df.pos_sent_bi.tolist() for bigram in sublist]
all_bigrams_n = [bigram for sublist in all_df_n.pos_sent_bi.tolist() for bigram in sublist]
all_bigrams_p = [bigram for sublist in all_df_p.pos_sent_bi.tolist() for bigram in sublist]
all_bigrams[:5]
Out[51]:
[('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN')]
In [52]:
count = Counter(all_bigrams)
count_n = Counter(all_bigrams_n)
count_p = Counter(all_bigrams_p)
In [53]:
count.most_common()[:5]

import numpy as np

# all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)
# new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])

# most_common_pos = [word[0] for word in big_bow_p.most_common(100)]
# print("Unique values in array1 that are not in array2:")
most_common_n = [word[0] for word in count_n.most_common(10)]
most_common_p = [word[0] for word in count_p.most_common(10)]

neg_notpos = np.setdiff1d(most_common_n, most_common_p)
neg_notpos

# all_bigrams_n
Out[53]:
array(['VBZ'], dtype='<U3')
In [54]:
all_df_n = all_df[all_df['PoN'] == 'N']
all_df_p = all_df[all_df['PoN'] == 'P']

big_bow = [item for review in all_df['pos_sent_bi'].tolist() for item in review]
big_bow_n = [item for review in all_df_n['pos_sent_bi'].tolist() for item in review]
big_bow_p = [item for review in all_df_p['pos_sent_bi'].tolist() for item in review]

df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()
df = df.rename(columns={'index':'word', 0:'count'})

df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()
df_n = df_n.rename(columns={'index':'word', 0:'count'})

df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()
df_p = df_p.rename(columns={'index':'word', 0:'count'})
In [55]:
print(bar_plot(df_p.sort_values(by=["count"], ascending=False)[:10], "Top 10 POS Bigrams (ALL)"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [56]:
print(bar_plot(df_n.sort_values(by=["count"], ascending=False)[:10], "Top 10 POS Bigrams (TRUE)"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [57]:
print(bar_plot(df_p.sort_values(by=["count"], ascending=False)[:10], "Top 10 POS Bigrams (FALSE)"))
<module 'matplotlib.pyplot' from '/usr/local/lib/python3.7/site-packages/matplotlib/pyplot.py'>
In [82]:
all_df['bow_pos'] = all_df.apply(lambda x: Counter(x['pos_sent_bi']), axis=1)

new_df = pd.DataFrame(all_df['bow_pos'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
Out[82]:
(NN, NNS) (NNS, VBP) (VBP, JJ) (JJ, JJ) (JJ, NN) (NN, NN) (NN, CC) (CC, JJ) (NN, JJ) (JJ, VBZ) ... (WDT, MD) (WRB, MD) (MD, DT) (NNS, JJR) (JJR, EX) (VBP, MD) (JJS, WRB) (CD, RB) (JJS, VBG) (RP, TO)
PoN
N 1 1 2 1 2 2 1 2 1 1 ... 0 0 0 0 0 0 0 0 0 0
N 1 0 0 0 2 4 3 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 1 0 0 2 4 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 4 1 0 1 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 389 columns

In [83]:
get_NB(new_df, new_df.index)
Accuracy: 0.4444444444444444
In [88]:
bi_df_norm = new_df.copy()
bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)
# bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)
bi_df_norm

get_NB(bi_df_norm, bi_df_norm.index)
bi_df_norm
Accuracy: 0.48148148148148145
Out[88]:
(NN, NNS) (NNS, VBP) (VBP, JJ) (JJ, JJ) (JJ, NN) (NN, NN) (NN, CC) (CC, JJ) (NN, JJ) (JJ, VBZ) ... (WDT, MD) (WRB, MD) (MD, DT) (NNS, JJR) (JJR, EX) (VBP, MD) (JJS, WRB) (CD, RB) (JJS, VBG) (RP, TO)
PoN
N 0.019231 0.019231 0.038462 0.019231 0.038462 0.038462 0.019231 0.038462 0.019231 0.019231 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
N 0.009615 0.000000 0.000000 0.000000 0.019231 0.038462 0.028846 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
N 0.000000 0.022727 0.000000 0.000000 0.045455 0.090909 0.000000 0.022727 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.057143 0.014286 0.000000 0.014286 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.028571 0.000000 0.028571 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
P 0.000000 0.023810 0.023810 0.000000 0.071429 0.023810 0.023810 0.000000 0.023810 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.02381 0.000000 0.000000 0.00000
P 0.000000 0.000000 0.000000 0.000000 0.000000 0.043478 0.000000 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.000000 0.00000
P 0.000000 0.000000 0.000000 0.000000 0.030612 0.020408 0.020408 0.010204 0.010204 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.010204 0.000000 0.00000
P 0.000000 0.006494 0.000000 0.012987 0.038961 0.071429 0.025974 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.00000 0.000000 0.006494 0.00000
P 0.000000 0.000000 0.023810 0.000000 0.119048 0.047619 0.023810 0.023810 0.000000 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.02381 0.000000 0.000000 0.02381

90 rows × 389 columns

In [85]:
df_p.sort_values(by=["count"], ascending=False)[:20]
Out[85]:
word count
11 (DT, NN) 245
38 (JJ, NN) 151
0 (NN, NN) 125
44 (NN, IN) 114
1 (NN, VBD) 112
43 (IN, DT) 111
5 (NN, CC) 77
19 (TO, VB) 76
37 (DT, JJ) 62
26 (RB, JJ) 61
105 (PRP, VBD) 60
29 (IN, NN) 57
2 (VBD, DT) 55
25 (VBD, RB) 45
75 (PRP$, NN) 44
12 (NN, VBZ) 42
83 (NN, RB) 38
86 (IN, JJ) 37
39 (CC, DT) 36
65 (JJ, CC) 34
In [62]:
df_n.sort_values(by=["count"], ascending=False)[:20]
Out[62]:
word count
11 (DT, NN) 230
4 (JJ, NN) 145
24 (NN, IN) 122
49 (NN, VBD) 106
5 (NN, NN) 102
32 (IN, DT) 96
33 (DT, JJ) 76
44 (IN, NN) 65
20 (TO, VB) 59
12 (NN, VBZ) 56
14 (RB, JJ) 53
59 (PRP, VBD) 48
6 (NN, CC) 48
60 (VBD, RB) 47
79 (NN, DT) 41
52 (VBD, DT) 37
28 (IN, PRP) 36
61 (JJ, CC) 35
74 (VBD, JJ) 34
57 (PRP$, NN) 33
In [63]:
from nltk import word_tokenize 
from nltk.util import ngrams

text = ['cant railway station', 'citadel hotel', 'police stn']
def get_ngram(line, num):
    token = nltk.word_tokenize(line)
    grams = list(ngrams(token, num)) 
    return(grams)

# all_df['trigrams'] = all_df.apply(lambda x: get_ngram(x[0],3), axis=1)
all_df['trigrams'] = all_df.apply(lambda x: get_ngram(' '.join(x['tokens']),3), axis=1)
all_df['trigrams_pos'] = all_df.apply(lambda x: get_ngram(' '.join(x['pos_sent']),3), axis=1)

# ' '.join(all_df['tokens'][1])
                                  
# counter = all_df['trigrams_pos']
In [64]:
all_df['trigrams_feats'] = all_df.apply(lambda x: ['_'.join(trigram) for trigram in x['trigrams_pos']], axis=1)
In [65]:
def flatten_column(df, column):
    return [features for row in df[column].tolist() for features in row]
#     [bigram for sublist in all_df_n.pos_sent_bi.tolist() for bigram in sublist]

flat_trigrams = Counter(flatten_column(all_df, 'trigrams_feats'))
flat_trigrams_n = Counter(flatten_column(all_df[all_df['PoN'] == 'N'], 'trigrams_feats'))
flat_trigrams_p = Counter(flatten_column(all_df[all_df['PoN'] == 'P'], 'trigrams_feats'))

most_common_n = [word[0] for word in flat_trigrams_n.most_common(10)]
most_common_p = [word[0] for word in flat_trigrams_p.most_common(10)]

neg_notpos = np.setdiff1d(most_common_n, most_common_p)
neg_notpos
Out[65]:
array(['JJ_NN_IN', 'NN_IN_DT', 'NN_IN_NN'], dtype='<U9')
In [72]:
all_df['trigrams_feats_bow'] = all_df.apply(lambda x: Counter(x['trigrams_feats']), axis=1)
In [77]:
new_df = pd.DataFrame(all_df['trigrams_feats_bow'].tolist(), all_df['PoN'])
new_df = new_df.fillna(0).astype(int)
new_df[:5]
Out[77]:
NN_NNS_VBP NNS_VBP_JJ VBP_JJ_JJ JJ_JJ_NN JJ_NN_NN NN_NN_CC NN_CC_JJ CC_JJ_NN JJ_NN_JJ NN_JJ_VBZ ... RB_NN_NN VBD_RP_PRP PRP_TO_JJ TO_JJ_JJ CC_VB_NN JJS_WRB_NN NN_VBP_RP VBP_RP_TO RP_TO_VB VBD_PRP_CC
PoN
N 1 1 1 1 1 1 1 1 1 1 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 2 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 1683 columns

In [78]:
get_NB(new_df, new_df.index)
Accuracy: 0.5185185185185185
In [80]:
tri_df_norm = new_df.copy()
tri_df_norm = tri_df_norm.apply(lambda x: x/x.sum(), axis=1)
tri_df_norm

get_NB(tri_df_norm, tri_df_norm.index)
tri_df_norm
Accuracy: 0.5185185185185185
Out[80]:
NN_NNS_VBP NNS_VBP_JJ VBP_JJ_JJ JJ_JJ_NN JJ_NN_NN NN_NN_CC NN_CC_JJ CC_JJ_NN JJ_NN_JJ NN_JJ_VBZ ... RB_NN_NN VBD_RP_PRP PRP_TO_JJ TO_JJ_JJ CC_VB_NN JJS_WRB_NN NN_VBP_RP VBP_RP_TO RP_TO_VB VBD_PRP_CC
PoN
N 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.018868 0.009434 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.023256 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
P 0.000000 0.024390 0.000000 0.000000 0.000000 0.024390 0.000000 0.000000 0.024390 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 0.00000 0.00000
P 0.000000 0.000000 0.000000 0.012346 0.030864 0.012346 0.000000 0.000000 0.000000 0.000000 ... 0.006173 0.006173 0.006173 0.006173 0.006173 0.00000 0.00000 0.00000 0.00000 0.00000
P 0.000000 0.000000 0.000000 0.000000 0.048780 0.000000 0.024390 0.024390 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.02439 0.02439 0.02439 0.02439 0.02439

90 rows × 1683 columns

In [92]:
all_df
Out[92]:
0 PoN sentences num_sentences tokens num_tokens no_sw num_no_sw stemmed stemmed_no_sw ... bow_no_sw pos_sent pos_sent_str pos_no_sw_sent pos_sent_bi bow_pos trigrams trigrams_pos trigrams_feats trigrams_feats_bow
1 Twin Trees Cicero NY HUGE salad bar and high q... N [Twin Trees Cicero NY HUGE salad bar and high ... 4 [twin, trees, cicero, ny, huge, salad, bar, an... 53 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 [twin, tree, cicero, ny, huge, salad, bar, and... [twin, tree, cicero, ny, huge, salad, bar, hig... ... {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... [NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,... [NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ... [NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS... [(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (... {('NN', 'NNS'): 1, ('NNS', 'VBP'): 1, ('VBP', ... [(twin, trees, cicero), (trees, cicero, ny), (... [(NN, NNS, VBP), (NNS, VBP, JJ), (VBP, JJ, JJ)... [NN_NNS_VBP, NNS_VBP_JJ, VBP_JJ_JJ, JJ_JJ_NN, ... {'NN_NNS_VBP': 1, 'NNS_VBP_JJ': 1, 'VBP_JJ_JJ'...
2 The worst restaurant that I have ever eaten in... N [The worst restaurant that I have ever eaten i... 5 [the, worst, restaurant, that, i, have, ever, ... 105 [worst, restaurant, ever, eaten, undoubtedly, ... 49 [the, worst, restaur, that, i, have, ever, eat... [worst, restaur, ever, eaten, undoubtedli, pla... ... {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... [DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R... [DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB... [RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN... [(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN... {('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'I... [(the, worst, restaurant), (worst, restaurant,... [(DT, JJS, NN), (JJS, NN, IN), (NN, IN, NN), (... [DT_JJS_NN, JJS_NN_IN, NN_IN_NN, IN_NN_VBP, NN... {'DT_JJS_NN': 1, 'JJS_NN_IN': 1, 'NN_IN_NN': 2...
4 I have been to a Asian restaurant in New York ... N [I have been to a Asian restaurant in New York... 4 [i, have, been, to, a, asian, restaurant, in, ... 45 [asian, restaurant, new, york, city, menu, wri... 23 [i, have, been, to, a, asian, restaur, in, new... [asian, restaur, new, york, citi, menu, writte... ... {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... [NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN... [NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ... [JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,... [(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ... {('NNS', 'VBP'): 1, ('VBP', 'VBN'): 1, ('VBN',... [(i, have, been), (have, been, to), (been, to,... [(NNS, VBP, VBN), (VBP, VBN, TO), (VBN, TO, DT... [NNS_VBP_VBN, VBP_VBN_TO, VBN_TO_DT, TO_DT_JJ,... {'NNS_VBP_VBN': 1, 'VBP_VBN_TO': 1, 'VBN_TO_DT...
5 The best restaurant I have gone to is when I w... N [The best restaurant I have gone to is when I ... 6 [the, best, restaurant, i, have, gone, to, is,... 71 [best, restaurant, gone, went, applebee, frien... 30 [the, best, restaur, i, have, gone, to, is, wh... [best, restaur, gone, went, applebe, friend, s... ... {'best': 1, 'restaurant': 2, 'gone': 1, 'went'... [DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ... [DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ... [RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V... [(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V... {('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'N... [(the, best, restaurant), (best, restaurant, i... [(DT, JJS, NN), (JJS, NN, NN), (NN, NN, VBP), ... [DT_JJS_NN, JJS_NN_NN, NN_NN_VBP, NN_VBP_VBN, ... {'DT_JJS_NN': 1, 'JJS_NN_NN': 1, 'NN_NN_VBP': ...
6 The restaurant looked pretty good the people a... N [The restaurant looked pretty good the people ... 3 [the, restaurant, looked, pretty, good, the, p... 36 [restaurant, looked, pretty, good, people, aro... 19 [the, restaur, look, pretti, good, the, peopl,... [restaur, look, pretti, good, peopl, around, a... ... {'restaurant': 1, 'looked': 1, 'pretty': 1, 'g... [DT, NN, VBD, RB, JJ, DT, NNS, IN, PRP, DT, NN... [DT NN VBD RB JJ DT NNS IN PRP DT NN CC VBD RB... [NN, VBD, RB, JJ, NNS, IN, NN, VBD, RB, NN, JJ... [(DT, NN), (NN, VBD), (VBD, RB), (RB, JJ), (JJ... {('DT', 'NN'): 5, ('NN', 'VBD'): 3, ('VBD', 'R... [(the, restaurant, looked), (restaurant, looke... [(DT, NN, VBD), (NN, VBD, RB), (VBD, RB, JJ), ... [DT_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB_JJ_DT, JJ... {'DT_NN_VBD': 3, 'NN_VBD_RB': 1, 'VBD_RB_JJ': ...
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
87 Mikes Pizza High Point NY Service was very slo... P [Mikes Pizza High Point NY Service was very sl... 4 [mikes, pizza, high, point, ny, service, was, ... 43 [mikes, pizza, high, point, ny, service, slow,... 26 [mike, pizza, high, point, ny, servic, wa, ver... [mike, pizza, high, point, ny, servic, slow, q... ... {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... [NNS, VBP, JJ, NN, JJ, NN, VBD, RB, JJ, CC, DT... [NNS VBP JJ NN JJ NN VBD RB JJ CC DT NN VBD JJ... [NNS, VBP, JJ, NN, JJ, NN, JJ, NN, NN, MD, VB,... [(NNS, VBP), (VBP, JJ), (JJ, NN), (NN, JJ), (J... {('NNS', 'VBP'): 1, ('VBP', 'JJ'): 1, ('JJ', '... [(mikes, pizza, high), (pizza, high, point), (... [(NNS, VBP, JJ), (VBP, JJ, NN), (JJ, NN, JJ), ... [NNS_VBP_JJ, VBP_JJ_NN, JJ_NN_JJ, NN_JJ_NN, JJ... {'NNS_VBP_JJ': 1, 'VBP_JJ_NN': 1, 'JJ_NN_JJ': ...
88 After I went shopping with some of my friend w... P [After I went shopping with some of my friend ... 2 [after, i, went, shopping, with, some, of, my,... 24 [went, shopping, friend, went, dodo, restauran... 11 [after, i, went, shop, with, some, of, my, fri... [went, shop, friend, went, dodo, restaur, dinn... ... {'went': 2, 'shopping': 1, 'friend': 1, 'dodo'... [IN, JJ, VBD, VBG, IN, DT, IN, PRP$, NN, PRP, ... [IN JJ VBD VBG IN DT IN PRP$ NN PRP VBD TO VB ... [VBD, VBG, NN, VBD, JJ, NN, NN, VBD, RB, CD, NNS] [(IN, JJ), (JJ, VBD), (VBD, VBG), (VBG, IN), (... {('IN', 'JJ'): 1, ('JJ', 'VBD'): 1, ('VBD', 'V... [(after, i, went), (i, went, shopping), (went,... [(IN, JJ, VBD), (JJ, VBD, VBG), (VBD, VBG, IN)... [IN_JJ_VBD, JJ_VBD_VBG, VBD_VBG_IN, VBG_IN_DT,... {'IN_JJ_VBD': 1, 'JJ_VBD_VBG': 1, 'VBD_VBG_IN'...
89 I entered the restaurant and a waitress came b... P [I entered the restaurant and a waitress came ... 5 [i, entered, the, restaurant, and, a, waitress... 99 [entered, restaurant, waitress, came, blanking... 49 [i, enter, the, restaur, and, a, waitress, cam... [enter, restaur, waitress, came, blank, look, ... ... {'entered': 1, 'restaurant': 1, 'waitress': 2,... [NN, VBD, DT, NN, CC, DT, NN, VBD, IN, IN, DT,... [NN VBD DT NN CC DT NN VBD IN IN DT NN VBG CC ... [VBN, NN, NN, VBD, VBG, VBG, JJ, NN, NN, VBD, ... [(NN, VBD), (VBD, DT), (DT, NN), (NN, CC), (CC... {('NN', 'VBD'): 5, ('VBD', 'DT'): 4, ('DT', 'N... [(i, entered, the), (entered, the, restaurant)... [(NN, VBD, DT), (VBD, DT, NN), (DT, NN, CC), (... [NN_VBD_DT, VBD_DT_NN, DT_NN_CC, NN_CC_DT, CC_... {'NN_VBD_DT': 1, 'VBD_DT_NN': 3, 'DT_NN_CC': 1...
90 Carlos Plate Shack was the worst dining experi... P [Carlos Plate Shack was the worst dining exper... 9 [carlos, plate, shack, was, the, worst, dining... 155 [carlos, plate, shack, worst, dining, experien... 88 [carlo, plate, shack, wa, the, worst, dine, ex... [carlo, plate, shack, worst, dine, experi, lif... ... {'carlos': 1, 'plate': 6, 'shack': 1, 'worst':... [NN, NN, NN, VBD, DT, JJS, VBG, NN, IN, PRP$, ... [NN NN NN VBD DT JJS VBG NN IN PRP$ NN IN PRP$... [NN, NN, NN, JJS, VBG, NN, NN, IN, JJ, NN, NN,... [(NN, NN), (NN, NN), (NN, VBD), (VBD, DT), (DT... {('NN', 'NN'): 11, ('NN', 'VBD'): 6, ('VBD', '... [(carlos, plate, shack), (plate, shack, was), ... [(NN, NN, NN), (NN, NN, VBD), (NN, VBD, DT), (... [NN_NN_NN, NN_NN_VBD, NN_VBD_DT, VBD_DT_JJS, D... {'NN_NN_NN': 2, 'NN_NN_VBD': 3, 'NN_VBD_DT': 2...
91 Olive Oil Garden was very disappointing. I exp... P [Olive Oil Garden was very disappointing., I e... 5 [olive, oil, garden, was, very, disappointing,... 43 [olive, oil, garden, disappointing, expect, go... 23 [oliv, oil, garden, wa, veri, disappoint, i, e... [oliv, oil, garden, disappoint, expect, good, ... ... {'olive': 2, 'oil': 2, 'garden': 2, 'disappoin... [JJ, NN, NN, VBD, RB, JJ, NN, VBP, JJ, NN, CC,... [JJ NN NN VBD RB JJ NN VBP JJ NN CC JJ NN IN J... [JJ, NN, NN, NN, VBP, JJ, NN, JJ, NN, JJS, VB,... [(JJ, NN), (NN, NN), (NN, VBD), (VBD, RB), (RB... {('JJ', 'NN'): 5, ('NN', 'NN'): 2, ('NN', 'VBD... [(olive, oil, garden), (oil, garden, was), (ga... [(JJ, NN, NN), (NN, NN, VBD), (NN, VBD, RB), (... [JJ_NN_NN, NN_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB... {'JJ_NN_NN': 2, 'NN_NN_VBD': 1, 'NN_VBD_RB': 1...

90 rows × 27 columns

In [ ]: