import os
def get_data_from_files(path, encoding=None):
    """Read every regular file directly inside *path* and return the texts.

    Args:
        path: Directory to scan. A trailing path separator is optional.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        A list with one string per file, ordered by file name.

    Raises:
        OSError: If *path* cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file cannot be decoded as text.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # result order stable between runs.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Sub-directories cannot be opened as text files, so skip them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even when read() raises.
        with open(full_path, encoding=encoding) as handle:
            results.append(handle.read())
    return results
# Load the file contents of the two corpus folders (negative folder first,
# then positive) and report how many texts each one produced.
neg, pos = map(get_data_from_files, ('../NEG_JK_E/', '../POS_JK_E/'))
print('Neg Reviews:', len(neg))
print('Pos Reviews:', len(pos))
import pandas as pd
# Wrap each list of texts in a one-column DataFrame (the column is labelled 0).
neg_df = pd.DataFrame(neg)
pos_df = pd.DataFrame(pos)
# Tag every row with its class: 'P' for the positive set, 'N' for the negative.
pos_df['PoN'] = 'P'
neg_df['PoN'] = 'N'
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack frames. ignore_index=True renumbers the rows so labels stay unique
# instead of restarting at 0 for the second frame.
all_df = pd.concat([neg_df, pos_df], ignore_index=True)
all_df[:5]  # notebook-style preview of the first five rows
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
def get_tokens(sentence):
    """Tokenize *sentence* and keep only its alphabetic tokens, lower-cased.

    Args:
        sentence: Text passed to ``word_tokenize``.

    Returns:
        A list of lower-cased tokens; any token for which ``str.isalpha()``
        is false (punctuation, numbers, mixed tokens) is dropped.
    """
    clean_tokens = []
    for token in word_tokenize(sentence):
        if not token.isalpha():
            continue
        clean_tokens.append(token.lower())
    return clean_tokens
# Tokenize the text held in column 0. Applying the function to the column
# itself avoids building a Series per row (as DataFrame.apply(axis=1) does)
# and the integer lookup x[0] on a mixed-label row.
all_df['tokens'] = all_df[0].apply(get_tokens)
# Number of cleaned tokens per row.
all_df['num_tokens'] = all_df['tokens'].apply(len)
all_df[:5]  # notebook-style preview of the first five rows
from nltk.tokenize import casual_tokenize
from collections import Counter
# all_df['bow'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)
# Bag of words: one Counter of token frequencies per row. Applied to the
# 'tokens' column directly rather than row by row over the whole frame.
all_df['bow'] = all_df['tokens'].apply(Counter)
all_df[:5]  # notebook-style preview of the first five rows
# Expand the Counters into a table: one row per text, one column per token.
freq_df = pd.DataFrame(all_df['bow'].tolist())
# A token missing from a text shows up as NaN; store it as a count of 0.
freq_df = freq_df.fillna(0).astype(int)
freq_df[:5]  # notebook-style preview of the first five rows
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw token counts with TF-IDF (L2 norm, smoothed IDF,
# no sublinear TF scaling).
tfidf = TfidfTransformer(
    norm=u'l2',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=False,
)
data = tfidf.fit_transform(freq_df.values)
# Densify the transformer output and wrap it in a DataFrame.
tfidf_reduced = pd.DataFrame(data.todense())
tfidf_reduced.head(5)  # notebook-style preview of the first five rows
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
def get_NB(small_df, labels, test_size=0.3, random_state=109):
    """Fit a Gaussian naive Bayes model and report its hold-out accuracy.

    Args:
        small_df: Feature table; only ``small_df.values`` is used.
        labels: Class labels, split together with the feature rows.
        test_size: ``test_size`` argument forwarded to ``train_test_split``.
            Defaults to 0.3.
        random_state: ``random_state`` argument forwarded to
            ``train_test_split``. Defaults to 109.

    Returns:
        The accuracy score on the held-out split. The same value is also
        printed as ``Accuracy: <score>``.
    """
    from sklearn import metrics

    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    return accuracy
# Train and score the classifier on the TF-IDF features with the 'PoN' labels.
get_NB(small_df=tfidf_reduced, labels=all_df['PoN'])