# NOTE: The delimiter (`sep`) passed to read_csv below may need to change to match the data file.
import pandas as pd
# Read the raw data file using a tab as the field separator.
df = pd.read_csv('deception_data_converted_final.csv', sep='\t')
# Preview the first five rows.
df.head()
def get_labels(row, delimiter: str = ',') -> list:
    """Split one raw record into its two labels and the leftover review pieces.

    Args:
        row: The raw record. It is converted with ``str()`` before splitting,
            so non-string values are accepted.
        delimiter: Separator between fields inside the record. Defaults to
            ``','``.

    Returns:
        A three-item list ``[lie, sentiment, review_parts]``: the first field,
        the second field, and a list of every remaining field (empty when the
        record has exactly two fields).

    Raises:
        IndexError: If the record contains fewer than two fields.
    """
    fields = str(row).split(delimiter)
    # The review text may itself contain the delimiter, so everything after
    # the second field is handed back as a list for the caller to reassemble.
    return [fields[0], fields[1], fields[2:]]
# Parse each raw record once; 'all' holds [lie, sentiment, review_parts].
# Series.map works on the single source column directly instead of building a
# row object for every record the way DataFrame.apply(axis=1) does.
df['all'] = df['lie,sentiment,review'].map(get_labels)
df[:5]

# Keep only the first character of each of the two label fields.
df['lie'] = df['all'].map(lambda parts: parts[0][0])
df[:5]
df['sentiment'] = df['all'].map(lambda parts: parts[1][0])
df[:5]

# NOTE(review): the pieces are glued back together with '' rather than ',',
# so any commas that were inside the review text are dropped -- confirm this
# is intended before relying on the punctuation downstream.
df['review'] = df['all'].map(lambda parts: ''.join(parts[2]))
df[:5]

# Continue on a copy without the raw and intermediate columns.
clean_df = df.copy()
clean_df.drop(columns=['lie,sentiment,review', 'all'], inplace=True)
clean_df
# Deletion table for the stray characters: backslash, single quote, double quote.
_ROGUE_CHAR_TABLE = str.maketrans('', '', '\\' + "'" + '"')


def clean_rogue_characters(string):
    """Remove literal ``\\n`` markers and stray backslash/quote characters.

    Args:
        string: The text to clean.

    Returns:
        ``string`` with every two-character backslash-``n`` sequence removed,
        followed by removal of every remaining backslash, single quote and
        double quote. Real newline characters are left untouched.
    """
    # Drop the backslash-n pairs first, while the backslash is still present
    # to identify them; then delete the remaining single characters in one pass.
    return string.replace('\\n', '').translate(_ROGUE_CHAR_TABLE)
# Scrub every review, then look at the first cleaned one.
clean_df['review'] = clean_df['review'].apply(clean_rogue_characters)
clean_df['review'][0]

# Write the cleaned table to disk (without the index) and load it back.
clean_df.to_csv('hw4_data.csv', index=False)
df = pd.read_csv('hw4_data.csv')
df.head()

# Subsets of the reloaded rows, one per 'lie' value and one per 'sentiment' value.
lie_df_f = df[df['lie'].eq('f')]
lie_df_t = df[df['lie'].eq('t')]
sent_df_n = df[df['sentiment'].eq('n')]
sent_df_p = df[df['sentiment'].eq('p')]