# NOTE: The delimiter (`sep`) passed to `pd.read_csv` below may need to change depending on the data file.
import pandas as pd
# Load the raw data. With a tab separator the later code expects each line to
# land in a single column named 'lie,sentiment,review'.
df = pd.read_csv(
    'deception_data_converted_final.csv',
    sep='\t',
)
# Preview the first five rows (only displays in an interactive session).
df.head(5)
def get_labels(row):
    """Split a raw comma-separated value into lie, sentiment and review pieces.

    Args:
        row: The raw value; it is converted with ``str()`` before splitting.

    Returns:
        A three-item list ``[lie, sentiment, review_pieces]`` where ``lie`` and
        ``sentiment`` are the first two comma-separated fields and
        ``review_pieces`` is the list of all remaining fields (the commas that
        separated them are not kept).

    Raises:
        IndexError: If the value contains fewer than two fields.
    """
    fields = str(row).split(',')
    # Every field after the first two belongs to the review text.
    return [fields[0], fields[1], fields[2:]]
# Parse every raw line once; each cell of 'all' holds
# [lie, sentiment, review_pieces] as returned by get_labels.
df['all'] = df['lie,sentiment,review'].apply(get_labels)
df.head(5)

# Keep only the first character of the lie field.
df['lie'] = df['all'].apply(lambda parts: parts[0][0])
df.head(5)

# Keep only the first character of the sentiment field.
df['sentiment'] = df['all'].apply(lambda parts: parts[1][0])
df.head(5)

# Re-assemble the review text; the pieces are joined with no separator, so
# the commas removed by the split are not restored.
df['review'] = df['all'].apply(lambda parts: ''.join(parts[2]))
df.head(5)

# Work on a separate frame without the raw and intermediate columns.
clean_df = df.drop(columns=['lie,sentiment,review', 'all'])
clean_df
def clean_rogue_characters(string):
    """Strip escaped newlines, backslashes and quote characters from text.

    Args:
        string: The text to clean.

    Returns:
        The text with every literal backslash-``n`` sequence removed, followed
        by removal of all remaining backslashes, single quotes and double
        quotes. Real newline characters are left untouched.
    """
    # Mapping a code point to None makes str.translate delete that character.
    unwanted = {ord('\\'): None, ord("'"): None, ord('"'): None}
    # The two-character sequence must go first, before lone backslashes are dropped.
    return string.replace('\\n', '').translate(unwanted)
# Remove escaped newlines, backslashes and quotes from every review.
clean_df['review'] = clean_df['review'].apply(clean_rogue_characters)
# Spot-check the first cleaned review (only displays in an interactive session).
clean_df.loc[0, 'review']

# Persist the cleaned table, then reload it from disk as the working frame.
clean_df.to_csv('hw4_data.csv', index=False)
df = pd.read_csv('hw4_data.csv')
df.head(5)

# Subsets by lie label.
lie_df_f = df.loc[df['lie'] == 'f']
lie_df_t = df.loc[df['lie'] == 't']

# Subsets by sentiment label.
sent_df_n = df.loc[df['sentiment'] == 'n']
sent_df_p = df.loc[df['sentiment'] == 'p']
def print_to_file(rating, review, num, title):
    """Write one review to its own text file.

    The file is named ``<rating>_<title>_<num>.txt`` (a relative path) and
    contains only the review text. An existing file with that name is
    overwritten.

    Args:
        rating: Label placed at the start of the filename (converted with str()).
        review: Text written to the file; must be a string.
        num: Counter placed at the end of the filename (converted with str()).
        title: String placed in the middle of the filename.
    """
    output_filename = str(rating) + '_' + title + '_' + str(num) + '.txt'
    # The context manager closes the handle even when write() raises,
    # which the manual open()/close() pair did not guarantee.
    with open(output_filename, 'w') as outfile:
        outfile.write(review)
def export_to_corpus(df, subj, title):
    """Write every review in ``df`` to its own text file.

    Args:
        df: Frame (or mapping) whose ``'review'`` entry is iterated.
        subj: Label passed to ``print_to_file`` as the filename prefix.
        title: String passed to ``print_to_file`` for the middle of the filename.
    """
    reviews = df['review']
    # The counter is the position within this subset (starting at 0),
    # not the frame's index label.
    for position, text in enumerate(reviews):
        print_to_file(subj, text, position, title)
# One (subset, filename prefix, filename title) entry per exported corpus,
# processed in this order.
corpus_exports = (
    (sent_df_n, 'neg', 'hw4_n'),
    (sent_df_p, 'pos', 'hw4_p'),
    (lie_df_f, 'false', 'hw4_f'),
    (lie_df_t, 'true', 'hw4_t'),
)
for subset, subj, title in corpus_exports:
    export_to_corpus(subset, subj, title)