import pandas as pd
# Load the raw review export; pandas uses the first line of the file as the
# column header.
dirtyFile = pd.read_csv('moviereviewRAW.csv')
dirtyFile[:2]
# Collapse every row into a single comma-separated string: null cells are
# dropped, the remaining cells are cast to str and joined with ','.
# NOTE(review): presumably the CSV parser split reviews containing commas
# across several columns and this stitches them back together - confirm
# against the data file.
df = pd.DataFrame()
df['all'] = dirtyFile[dirtyFile.columns[0:]].apply(
lambda x: ','.join(x.dropna().astype(str)),
axis=1)
df[:4]
# The label is the third-from-last character of the combined string.
# NOTE(review): assumes every row ends with a three-character rating; a row
# shorter than three characters raises IndexError here.
df['label'] = df.apply(lambda x: x['all'][-3], axis=1)
df[:4]
def clean_rogue_characters(string):
    """Strip escaped newlines, backslashes and quote characters from text.

    Every literal two-character sequence "backslash followed by n" is
    removed first, then every remaining backslash, single quote and double
    quote.

    Args:
        string: The text to clean.

    Returns:
        A new string with the unwanted characters removed.
    """
    # Remove the escaped-newline sequence before the lone backslashes,
    # otherwise a stray "n" would be left behind.
    without_newlines = string.replace('\\n', '')
    # A translation table deletes all three characters in a single pass.
    return without_newlines.translate(str.maketrans('', '', '\\\'"'))
# Scrub every combined review string; the cleaner takes a single string, so
# it can be handed to apply directly.
df['all'] = df['all'].apply(clean_rogue_characters)
# Peek at the first cleaned row.
df['all'][0]
def print_to_file(rating, review, num):
    """Write one review to ``<rating>_dirty_<num>.txt`` in the working directory.

    An existing file with the same name is overwritten.

    Args:
        rating: Label used as the file-name prefix (converted with ``str``).
        review: Text written to the file.
        num: Index used in the file name (converted with ``str``).
    """
    output_filename = str(rating) + '_dirty_' + str(num) + '.txt'
    # The context manager closes the handle even if the write fails, and the
    # explicit encoding keeps non-ASCII review text from depending on the
    # platform's default codec.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(review)
# Dump every cleaned row to its own text file. The third-from-last character
# of the row is passed as the rating and everything before the last three
# characters as the review text.
# NOTE(review): assumes each row ends with a three-character rating - confirm
# against the data; any separator before the rating stays in the review text.
for num, row in enumerate(df['all']):
    print_to_file(row[-3], row[:-3], num)
from nltk.tokenize import casual_tokenize
from collections import Counter
# Build one bag-of-words Counter per review. The last three characters of
# each row are excluded: the file-export loop treats them as the rating
# (row[-3]) and everything before them as the review (row[:-3]), so
# tokenizing them as well would leak the label into the features.
df['bow'] = df['all'].apply(lambda text: Counter(casual_tokenize(text[:-3])))
# One column per distinct token; tokens absent from a review count as 0.
freq_df = pd.DataFrame(df['bow'].tolist())
freq_df = freq_df.fillna(0).astype(int)
# Total number of tokens in each document (vectorized row sum).
freq_df['DF_total'] = freq_df.sum(axis=1)
freq_df['DF_label'] = df['label']
freq_df[:5]
# Open question: should term counts be normalized per document, over the whole corpus, or over the positive-review corpus only?
# Work on a copy so the raw counts in freq_df stay untouched.
normalized_df = freq_df.copy()
# Split the label column off from the numeric feature columns.
normalized_df_label = normalized_df['DF_label']
normalized_df_no_label = normalized_df.drop('DF_label', axis=1)
normalized_df_no_label
# Turn raw counts into per-document relative frequencies by dividing every
# row by its own DF_total (vectorized; DF_total itself becomes 1.0). A
# document with no tokens gives 0/0 = NaN, which GaussianNB rejects, so
# those cells are reset to 0.
normalized_df_no_label = normalized_df_no_label.div(
    normalized_df_no_label['DF_total'], axis=0
).fillna(0)
normalized_df_label
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
def get_NB(small_df, labels, test_size=0.3, random_state=109):
    """Train and score a Gaussian Naive Bayes classifier on a hold-out split.

    Args:
        small_df: Feature table; only its ``.values`` array is used.
        labels: Target label for each row of ``small_df``.
        test_size: Fraction of rows held out for evaluation.
        random_state: Seed for the train/test shuffle, for reproducibility.

    Returns:
        The accuracy on the held-out rows (also printed).
    """
    from sklearn import metrics

    x_train, x_test, y_train, y_test = train_test_split(
        small_df.values, labels, test_size=test_size, random_state=random_state
    )
    gnb = GaussianNB()
    gnb.fit(x_train, y_train)
    y_pred = gnb.predict(x_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    # Returned as well as printed so callers can compare runs programmatically.
    return accuracy
# Fit and score the Naive Bayes baseline on the normalized feature table,
# using the DF_label column split off earlier as the target.
get_NB(small_df=normalized_df_no_label, labels=normalized_df_label)
# Show the feature table that was fed to the classifier.
normalized_df_no_label