import os
def get_data(file, path):
    """Return the entire text content of one file.

    Args:
        file: Name of the file, relative to ``path``.
        path: Directory that contains ``file``; a trailing separator is
            optional.

    Returns:
        The file's content as a single string.
    """
    # os.path.join tolerates a missing trailing slash on `path`, and the
    # context manager closes the handle even when read() raises.
    with open(os.path.join(path, file)) as handle:
        return handle.read()
def get_data_from_files(path):
    """Read every regular file directly inside ``path``.

    Args:
        path: Directory to scan (not recursive).

    Returns:
        A list with one string per file, ordered by file name.
    """
    # os.listdir gives no ordering guarantee; sorting keeps row order (and
    # therefore any seeded train/test split) reproducible across machines.
    # Sub-directories are skipped because they cannot be opened as text.
    names = sorted(
        name
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    )
    return [get_data(name, path) for name in names]
# Alternative corpus, kept for reference:
# pos = get_data_from_files('../pos_cornell//')
# neg = get_data_from_files('../neg_cornell/')
# NOTE(review): 'pos' is read from hw4_lie_false and 'neg' from hw4_lie_true;
# confirm this folder-to-label mapping is the intended one.
pos = get_data_from_files('../hw4_lie_false/')
neg = get_data_from_files('../hw4_lie_true/')
import pandas as pd

# One row per document; the raw text lands in the default column labelled 0.
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
# DataFrame.append was removed in pandas 2.0. concat with ignore_index=True
# stacks the 'N' rows first, then the 'P' rows, and renumbers them 0..n-1,
# which is what append + reset_index(drop=True) used to do.
all_df = pd.concat([neg_df, pos_df], ignore_index=True)
all_df
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
def get_sentence_tokens(review):
    """Split ``review`` into sentences with NLTK's ``sent_tokenize``.

    Args:
        review: Text to segment.

    Returns:
        The result of ``sent_tokenize(review)``.
    """
    sentences = sent_tokenize(review)
    return sentences
# Column 0 holds the raw text. Applying the function to that one column
# avoids building a temporary row Series for every document, which is what
# DataFrame.apply(axis=1) does.
all_df['sentences'] = all_df[0].apply(get_sentence_tokens)
all_df['num_sentences'] = all_df['sentences'].apply(len)
def get_tokens(sentence):
    """Tokenize text and keep only purely alphabetic tokens, lower-cased.

    Args:
        sentence: Text to pass to ``word_tokenize``.

    Returns:
        A list of lower-cased tokens; any token for which ``isalpha()`` is
        false (numbers, punctuation, mixed tokens) is dropped.
    """
    alphabetic = (word for word in word_tokenize(sentence) if word.isalpha())
    return [word.lower() for word in alphabetic]
# Word tokens of each full document (column 0) and their count. This uses
# column-wise Series.apply rather than a row-wise DataFrame.apply(axis=1).
all_df['tokens'] = all_df[0].apply(get_tokens)
all_df['num_tokens'] = all_df['tokens'].apply(len)
all_df[:3]
from nltk.corpus import stopwords
# English stop-word list from NLTK, stored as a set so the membership test
# performed for every token in remove_stopwords is a hash lookup.
stop_words = set(stopwords.words("english"))
def remove_stopwords(sentence):
    """Return the tokens of ``sentence`` that are not in ``stop_words``.

    Args:
        sentence: Iterable of tokens.

    Returns:
        A new list with the surviving tokens in their original order.
    """
    return [word for word in sentence if word not in stop_words]
# Stop-word-free token lists and their lengths, computed column-wise rather
# than with a row-wise DataFrame.apply(axis=1).
all_df['no_sw'] = all_df['tokens'].apply(remove_stopwords)
all_df['num_no_sw'] = all_df['no_sw'].apply(len)
all_df[:3]
from nltk.stem import PorterStemmer
def get_stems(sentence):
    """Stem every token in ``sentence`` with a Porter stemmer.

    Args:
        sentence: Iterable of tokens.

    Returns:
        A list holding the stem of each token, in input order.
    """
    stem = PorterStemmer().stem
    return list(map(stem, sentence))
# Stems with and without stop words, computed column-wise rather than with a
# row-wise DataFrame.apply(axis=1).
all_df['stemmed'] = all_df['tokens'].apply(get_stems)
all_df['stemmed_no_sw'] = all_df['no_sw'].apply(get_stems)
all_df[:3]
from nltk.stem.wordnet import WordNetLemmatizer
def get_lemmas(sentence):
    """Lemmatize every token in ``sentence`` with WordNet.

    Each token is passed to ``WordNetLemmatizer.lemmatize`` with that
    method's default arguments.

    Args:
        sentence: Iterable of tokens.

    Returns:
        A list holding the lemma of each token, in input order.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in sentence:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas
# Lemmas with and without stop words, computed column-wise rather than with a
# row-wise DataFrame.apply(axis=1).
all_df['lemmed'] = all_df['tokens'].apply(get_lemmas)
all_df['lemmed_no_sw'] = all_df['no_sw'].apply(get_lemmas)
all_df[:3]
# `nltk` is not imported explicitly anywhere in the visible part of the file;
# presumably it only leaked in through `from nltk.sentiment.util import *`.
# Import it here so the POS tagging does not depend on that side effect.
import nltk

# Tag each token list; every cell becomes the list nltk.pos_tag returns.
all_df['pos'] = all_df['tokens'].apply(nltk.pos_tag)
all_df['pos_no_sw'] = all_df['no_sw'].apply(nltk.pos_tag)
def get_pos_dict(pos_tuple):
    """Count how often each tag appears in a sequence of tagged tokens.

    Args:
        pos_tuple: Iterable of indexable items whose element ``[1]`` is the
            tag (e.g. ``(token, tag)`` pairs).

    Returns:
        A plain dict mapping each tag to its number of occurrences, keyed in
        order of first appearance.
    """
    tag_counts = {}
    for tag in (entry[1] for entry in pos_tuple):
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
    return tag_counts
# Per-document tag histograms, computed column-wise rather than with a
# row-wise DataFrame.apply(axis=1).
all_df['pos_dict'] = all_df['pos'].apply(get_pos_dict)
all_df['pos_dict_no_sw'] = all_df['pos_no_sw'].apply(get_pos_dict)
all_df[:3]
def get_bow_from_tokens(df, column):
    """Build one bag-of-words over every row of ``df[column]``.

    Args:
        df: DataFrame holding the text data.
        column: Label of the column to aggregate. Each cell may be either a
            whitespace-separated string or an iterable of tokens.

    Returns:
        A ``Counter`` mapping each token to its total count across all rows.
    """
    bag = Counter()
    for cell in df[column]:
        # The previous ' '.join(...) only worked for string cells and raised
        # TypeError on token lists; strings are still split on whitespace
        # exactly as before.
        bag.update(cell.split() if isinstance(cell, str) else cell)
    return bag
# bow = get_bow_from_column(all_df, 'diy_cleaner')
# bow =
from collections import Counter
# Per-document bag-of-words, with and without stop words, computed
# column-wise rather than with a row-wise DataFrame.apply(axis=1).
all_df['bow'] = all_df['tokens'].apply(lambda tokens: Counter(tokens))
all_df['bow_no_sw'] = all_df['no_sw'].apply(lambda tokens: Counter(tokens))
all_df[:3]
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
def get_NB(small_df, labels, test_size=0.3, random_state=109):
    """Fit a Gaussian Naive Bayes model and print its hold-out accuracy.

    Args:
        small_df: Feature table; its ``.values`` array is used as the design
            matrix.
        labels: One class label per row of ``small_df``.
        test_size: Passed to ``train_test_split``; defaults to the 0.3 that
            was previously hard-coded.
        random_state: Seed passed to ``train_test_split``; defaults to the
            109 that was previously hard-coded.

    Returns:
        None. The accuracy on the held-out rows is printed.
    """
    from sklearn import metrics

    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=test_size, random_state=random_state
    )
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
# POS-count feature matrix: one column per tag, rows labelled with the
# document's 'PoN' class; tags absent from a document count as 0.
new_df = (
    pd.DataFrame(all_df['pos_dict'].tolist(), index=all_df['PoN'])
    .fillna(0)
    .astype(int)
)
get_NB(new_df, new_df.index)
new_df[:5]
# def normalize_df(df):
# df["total"] = df.sum(axis = 1)
# df = df.apply(lambda row: row/row["total"], axis = 1)
# df.drop("total", axis=1, inplace = True)
# return(df)
def normalize_df(df):
    """Divide every value by its row total so each row sums to 1.

    The frame is modified in place and the same object is returned, as
    before. A row whose total is 0 is divided by zero and ends up as NaN/inf
    according to pandas' division rules.

    Args:
        df: Numeric DataFrame.

    Returns:
        ``df`` itself, with every column replaced by its row-normalized
        values.
    """
    # Keep the totals outside the frame. The old temporary "total" column
    # overwrote, and then dropped, any real feature column with that name,
    # e.g. the word "total" in a bag-of-words table.
    row_totals = df.sum(axis=1)
    for name in list(df.columns):
        df[name] = df[name] / row_totals
    return df
# normalize_df rewrites its argument in place and returns that same object,
# so after this call `norm_df` and `new_df` are one and the same normalized
# frame.
norm_df = normalize_df(new_df)
# Disabled earlier experiment: normalize through a temporary 'total' column
# and classify on the normalized counts.
# new_df['total'] = new_df.sum(axis = 1)
# new_df_norm = new_df.copy()
# new_df_norm = new_df_norm.apply(lambda x: x/x['total'], axis=1)
# new_df_norm = new_df_norm.drop('total', axis=1)
# norm_df = norm_df.fillna(0).astype(int)
# get_NB(new_df_norm, new_df_norm.index)
# new_df_norm[:5]
norm_df
# new_df
# Bag-of-words feature matrix (stop words removed): one column per word,
# rows labelled with the document's 'PoN' class; absent words count as 0.
new_df = (
    pd.DataFrame(all_df['bow_no_sw'].tolist(), index=all_df['PoN'])
    .fillna(0)
    .astype(int)
)
new_df[:5]
get_NB(new_df, new_df.index)