{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW4 -- Sentiment and Lies" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: Import the data\n", "NOTE: `read_csv` is called with a tab separator, so each row loads as a single `lie,sentiment,review` string that STEP 2 splits by hand. Change `sep` if your data file uses a different delimiter." ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lie,sentiment,review
0f,n,'Mike\\'s Pizza High Point, NY Service was ...
1f,n,'i really like this buffet restaurant in M...
2f,n,'After I went shopping with some of my fri...
3f,n,'Olive Oil Garden was very disappointing. ...
4f,n,'The Seven Heaven restaurant was never kno...
# %%
import pandas as pd

# The file is read with a tab separator; the preview shows every record
# landing in a single 'lie,sentiment,review' column, which STEP 2 splits.
raw_csv = 'deception_data_converted_final.csv'
df = pd.read_csv(raw_csv, sep='\t')
df.head()

# %% [markdown]
# ## STEP 2: Pull out the labels
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lie,sentiment,reviewall
0f,n,'Mike\\'s Pizza High Point, NY Service was ...[f, n, ['Mike\\'s Pizza High Point, NY Service...
1f,n,'i really like this buffet restaurant in M...[f, n, ['i really like this buffet restaurant ...
2f,n,'After I went shopping with some of my fri...[f, n, ['After I went shopping with some of my...
3f,n,'Olive Oil Garden was very disappointing. ...[f, n, ['Olive Oil Garden was very disappointi...
4f,n,'The Seven Heaven restaurant was never kno...[f, n, ['The Seven Heaven restaurant was never...
# %%
def get_labels(row):
    """Split one raw 'lie,sentiment,review' record into its three parts.

    Parameters
    ----------
    row : object
        Raw record; it is converted with ``str()`` before splitting.

    Returns
    -------
    list
        ``[lie, sentiment, review_parts]``. ``review_parts`` is the list of
        everything after the second comma: one string when a review is
        present, an empty list when the record has only two fields.

    Raises
    ------
    IndexError
        If the record contains no comma at all.
    """
    # Split on the first two commas only. The review text itself contains
    # commas ("High Point, NY"); splitting on every comma and re-joining the
    # pieces with '' silently deleted them from the review.
    split_row = str(row).split(',', 2)
    lie = split_row[0]
    sentiment = split_row[1]
    return [lie, sentiment, split_row[2:]]


# Only the raw column is needed, so map over it instead of over whole rows.
df['all'] = df['lie,sentiment,review'].map(get_labels)
df.head()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lie,sentiment,reviewalllie
0f,n,'Mike\\'s Pizza High Point, NY Service was ...[f, n, ['Mike\\'s Pizza High Point, NY Service...f
1f,n,'i really like this buffet restaurant in M...[f, n, ['i really like this buffet restaurant ...f
2f,n,'After I went shopping with some of my fri...[f, n, ['After I went shopping with some of my...f
3f,n,'Olive Oil Garden was very disappointing. ...[f, n, ['Olive Oil Garden was very disappointi...f
4f,n,'The Seven Heaven restaurant was never kno...[f, n, ['The Seven Heaven restaurant was never...f
# %%
# Element 0 of each parsed 'all' list is the lie label; its first character
# becomes the 'lie' column.
df['lie'] = df['all'].map(lambda parts: parts[0][0])
df.head()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lie,sentiment,reviewallliesentiment
0f,n,'Mike\\'s Pizza High Point, NY Service was ...[f, n, ['Mike\\'s Pizza High Point, NY Service...fn
1f,n,'i really like this buffet restaurant in M...[f, n, ['i really like this buffet restaurant ...fn
2f,n,'After I went shopping with some of my fri...[f, n, ['After I went shopping with some of my...fn
3f,n,'Olive Oil Garden was very disappointing. ...[f, n, ['Olive Oil Garden was very disappointi...fn
4f,n,'The Seven Heaven restaurant was never kno...[f, n, ['The Seven Heaven restaurant was never...fn
# %%
# Element 1 of each parsed 'all' list is the sentiment label; its first
# character becomes the 'sentiment' column.
df['sentiment'] = df['all'].map(lambda parts: parts[1][0])
df.head()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
lie,sentiment,reviewallliesentimentreview
0f,n,'Mike\\'s Pizza High Point, NY Service was ...[f, n, ['Mike\\'s Pizza High Point, NY Service...fn'Mike\\'s Pizza High Point NY Service was very ...
1f,n,'i really like this buffet restaurant in M...[f, n, ['i really like this buffet restaurant ...fn'i really like this buffet restaurant in Marsh...
2f,n,'After I went shopping with some of my fri...[f, n, ['After I went shopping with some of my...fn'After I went shopping with some of my friend ...
3f,n,'Olive Oil Garden was very disappointing. ...[f, n, ['Olive Oil Garden was very disappointi...fn'Olive Oil Garden was very disappointing. I ex...
4f,n,'The Seven Heaven restaurant was never kno...[f, n, ['The Seven Heaven restaurant was never...fn'The Seven Heaven restaurant was never known f...
# %%
# Element 2 of each parsed 'all' list holds the review fragments; glue them
# back together into one string per row.
df['review'] = df['all'].map(lambda parts: ''.join(parts[2]))
df.head()

# %%
# Work on an independent copy that keeps only the parsed columns; `df`
# itself is left as it was.
clean_df = df.drop(columns=['lie,sentiment,review', 'all']).copy()
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
liesentimentreview
0fn'Mike\\'s Pizza High Point NY Service was very ...
1fn'i really like this buffet restaurant in Marsh...
2fn'After I went shopping with some of my friend ...
3fn'Olive Oil Garden was very disappointing. I ex...
4fn'The Seven Heaven restaurant was never known f...
............
87tp'Pastablities is a locally owned restaurant in...
88tp'I like the Pizza at Dominoes for their specia...
89tp'It was a really amazing Japanese restaurant. ...
90tp'How do I even pick a best experience at Joe\\'...
91tp'My sister and I ate at this restaurant called...
\n", "

92 rows × 3 columns

# %%
# Display the trimmed frame (lie, sentiment, review) before the text cleanup.
clean_df

# %% [markdown]
# ## STEP 3: Clean the data
# %%
def clean_rogue_characters(string):
    """Return `string` with leftover escape debris removed.

    Literal backslash-n sequences are dropped first; after that every
    backslash, single quote and double quote is deleted.
    """
    # Mapping a code point to None tells str.translate to delete it.
    unwanted = dict.fromkeys(map(ord, ['\\', "\'", '"']))
    return string.replace('\\n', '').translate(unwanted)


clean_df['review'] = clean_df['review'].map(clean_rogue_characters)
clean_df.loc[0, 'review']

# %% [markdown]
# ## STEP 4: Export cleaned, formatted CSV

# %%
clean_df.to_csv('hw4_data.csv', index=False)
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
liesentimentreview
0fnMikes Pizza High Point NY Service was very slo...
1fni really like this buffet restaurant in Marsha...
2fnAfter I went shopping with some of my friend w...
3fnOlive Oil Garden was very disappointing. I exp...
4fnThe Seven Heaven restaurant was never known fo...
# %%
df = pd.read_csv('hw4_data.csv')
df.head()

# %% [markdown]
# ## STEP 5: Split df into data sets

# %% [markdown]
# ### LIE DFs

# %%
lie_df_f = df[df['lie'] == 'f']
lie_df_t = df[df['lie'] == 't']

# %% [markdown]
# ### SENTIMENT DFs

# %%
sent_df_n = df[df['sentiment'] == 'n']
sent_df_p = df[df['sentiment'] == 'p']

# %% [markdown]
# ### STEP 5b: Export to Corpus to run on current pipelines

# %%
def print_to_file(rating, review, num, title):
    """Write one review to '<rating>_<title>_<num>.txt' in the working directory.

    Parameters
    ----------
    rating : object
        Label used as the file-name prefix (converted with ``str()``).
    review : str
        Text written to the file.
    num : object
        Running number used as the file-name suffix (converted with ``str()``).
    title : str
        Middle part of the file name.

    Raises
    ------
    TypeError
        If `review` is not a string (text-mode ``write`` only accepts ``str``).
    """
    output_filename = str(rating) + '_' + title + '_' + str(num) + '.txt'
    # The context manager closes the handle even when write() raises, which
    # the previous manual open()/close() pair did not guarantee.
    with open(output_filename, 'w') as outfile:
        outfile.write(review)


def export_to_corpus(df, subj, title):
    """Write every entry of ``df['review']`` to its own text file.

    Files are numbered by position in the frame (0, 1, 2, ...), not by the
    frame's index labels, and are named via `print_to_file` using `subj` as
    the prefix and `title` as the middle part.
    """
    for num, row in enumerate(df['review']):
        print_to_file(subj, row, num, title)


# %%
export_to_corpus(sent_df_n, 'neg', 'hw4_n')
export_to_corpus(sent_df_p, 'pos', 'hw4_p')

# %%
export_to_corpus(lie_df_f, 'false', 'hw4_f')
export_to_corpus(lie_df_t, 'true', 'hw4_t')