{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW2: VECTORIZATION (Pandas style!)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: Import ALL the things\n", "### Import libraries " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##########################################\n", "# NOTE: I'm toying with the idea of requiring the library just above \n", "# when I use it so it makes more sense in context\n", "##########################################\n", "# import os\n", "# import pandas as pd\n", "# from nltk.tokenize import word_tokenize, sent_tokenize\n", "# from nltk.sentiment import SentimentAnalyzer\n", "# from nltk.sentiment.util import *\n", "# from nltk.probability import FreqDist\n", "# from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "# sid = SentimentIntensityAnalyzer()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import data from files" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [], "source": [ "import os\n", "def get_data_from_files(path):\n", " directory = os.listdir(path)\n", " results = []\n", " for file in directory:\n", " f=open(path+file)\n", " results.append(f.read())\n", " f.close()\n", " return results\n", "\n", "# neg = get_data_from_files('../neg_cornell/')\n", "# pos = get_data_from_files('../pos_cornell/')\n", "\n", "# v1\n", "# neg = get_data_from_files('../hw4_lie_false/')\n", "# pos = get_data_from_files('../hw4_lie_true/')\n", "\n", "pos = get_data_from_files('../hw4_lie_false/')\n", "neg = get_data_from_files('../hw4_lie_true/')\n", "\n", "# neg = get_data_from_files('../neg_hw4/')\n", "# pos = get_data_from_files('../pos_hw4/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: Prep Data\n", "### STEP 2a: Turn that fresh text into a pandas DF" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "neg_df = pd.DataFrame(neg)\n", "pos_df = pd.DataFrame(pos)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 2b: Label it" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [], "source": [ "pos_df['PoN'] = 'P'\n", "neg_df['PoN'] = 'N'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 2c: Combine the dfs" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "all_df = neg_df.append(pos_df)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " 0 PoN\n", "0 ? N\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N\n", "2 The worst restaurant that I have ever eaten in... N\n", "3 ? N\n", "4 I have been to a Asian restaurant in New York ... N\n", ".. ... ..\n", "41 Mikes Pizza High Point NY Service was very slo... P\n", "42 After I went shopping with some of my friend w... P\n", "43 I entered the restaurant and a waitress came b... P\n", "44 Carlos Plate Shack was the worst dining experi... P\n", "45 Olive Oil Garden was very disappointing. I exp... P\n", "\n", "[92 rows x 2 columns]" ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: TOKENIZE (and clean)!!" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "## Came back and added sentences for tokinization for \"Summary experiment\"\n", "def get_sentence_tokens(review):\n", " return sent_tokenize(review)\n", " \n", "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n", "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n", "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "3 ? N \n", "4 I have been to a Asian restaurant in New York ... N \n", ".. ... .. \n", "41 Mikes Pizza High Point NY Service was very slo... P \n", "42 After I went shopping with some of my friend w... P \n", "43 I entered the restaurant and a waitress came b... P \n", "44 Carlos Plate Shack was the worst dining experi... P \n", "45 Olive Oil Garden was very disappointing. I exp... P \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "3 [?] 1 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", ".. ... ... \n", "41 [Mikes Pizza High Point NY Service was very sl... 4 \n", "42 [After I went shopping with some of my friend ... 2 \n", "43 [I entered the restaurant and a waitress came ... 5 \n", "44 [Carlos Plate Shack was the worst dining exper... 9 \n", "45 [Olive Oil Garden was very disappointing., I e... 5 \n", "\n", " tokens num_tokens \n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "3 [] 0 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", ".. ... ... \n", "41 [mikes, pizza, high, point, ny, service, was, ... 43 \n", "42 [after, i, went, shopping, with, some, of, my,... 24 \n", "43 [i, entered, the, restaurant, and, a, waitress... 99 \n", "44 [carlos, plate, shack, was, the, worst, dining... 155 \n", "45 [olive, oil, garden, was, very, disappointing,... 43 \n", "\n", "[92 rows x 6 columns]" ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: Remove Stopwords" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def remove_stopwords(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n", "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "3 ? N \n", "4 I have been to a Asian restaurant in New York ... N \n", ".. ... .. \n", "41 Mikes Pizza High Point NY Service was very slo... P \n", "42 After I went shopping with some of my friend w... P \n", "43 I entered the restaurant and a waitress came b... P \n", "44 Carlos Plate Shack was the worst dining experi... P \n", "45 Olive Oil Garden was very disappointing. I exp... P \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "3 [?] 1 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", ".. ... ... \n", "41 [Mikes Pizza High Point NY Service was very sl... 4 \n", "42 [After I went shopping with some of my friend ... 2 \n", "43 [I entered the restaurant and a waitress came ... 5 \n", "44 [Carlos Plate Shack was the worst dining exper... 9 \n", "45 [Olive Oil Garden was very disappointing., I e... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "3 [] 0 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", ".. ... ... \n", "41 [mikes, pizza, high, point, ny, service, was, ... 43 \n", "42 [after, i, went, shopping, with, some, of, my,... 24 \n", "43 [i, entered, the, restaurant, and, a, waitress... 99 \n", "44 [carlos, plate, shack, was, the, worst, dining... 155 \n", "45 [olive, oil, garden, was, very, disappointing,... 43 \n", "\n", " no_sw num_no_sw \n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "3 [] 0 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", ".. ... ... \n", "41 [mikes, pizza, high, point, ny, service, slow,... 26 \n", "42 [went, shopping, friend, went, dodo, restauran... 11 \n", "43 [entered, restaurant, waitress, came, blanking... 49 \n", "44 [carlos, plate, shack, worst, dining, experien... 88 \n", "45 [olive, oil, garden, disappointing, expect, go... 
23 \n", "\n", "[92 rows x 8 columns]" ] }, "execution_count": 93, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 5: Create a Frequency Distribution" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "def get_most_common(tokens):\n", " fdist = FreqDist(tokens)\n", " return fdist.most_common(12)\n", "all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)" ] }, { "cell_type": "code", "execution_count": 95, "metadata": { "scrolled": true }, "outputs": [], "source": [ "def get_most_common(tokens):\n", " fdist = FreqDist(tokens)\n", " return fdist.most_common(12)\n", "all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [], "source": [ "def get_fdist(tokens):\n", " return (FreqDist(tokens))\n", " \n", "all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)\n", "all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "3 ? N \n", "4 I have been to a Asian restaurant in New York ... N \n", ".. ... .. \n", "41 Mikes Pizza High Point NY Service was very slo... P \n", "42 After I went shopping with some of my friend w... P \n", "43 I entered the restaurant and a waitress came b... P \n", "44 Carlos Plate Shack was the worst dining experi... P \n", "45 Olive Oil Garden was very disappointing. I exp... P \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "3 [?] 1 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", ".. ... ... \n", "41 [Mikes Pizza High Point NY Service was very sl... 4 \n", "42 [After I went shopping with some of my friend ... 2 \n", "43 [I entered the restaurant and a waitress came ... 5 \n", "44 [Carlos Plate Shack was the worst dining exper... 9 \n", "45 [Olive Oil Garden was very disappointing., I e... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "3 [] 0 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", ".. ... ... \n", "41 [mikes, pizza, high, point, ny, service, was, ... 43 \n", "42 [after, i, went, shopping, with, some, of, my,... 24 \n", "43 [i, entered, the, restaurant, and, a, waitress... 99 \n", "44 [carlos, plate, shack, was, the, worst, dining... 155 \n", "45 [olive, oil, garden, was, very, disappointing,... 43 \n", "\n", " no_sw num_no_sw \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "3 [] 0 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", ".. ... ... \n", "41 [mikes, pizza, high, point, ny, service, slow,... 26 \n", "42 [went, shopping, friend, went, dodo, restauran... 11 \n", "43 [entered, restaurant, waitress, came, blanking... 49 \n", "44 [carlos, plate, shack, worst, dining, experien... 88 \n", "45 [olive, oil, garden, disappointing, expect, go... 23 \n", "\n", " topwords_unfil \\\n", "0 [] \n", "1 [(and, 3), (to, 3), (are, 2), (the, 2), (twin,... \n", "2 [(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),... \n", "3 [] \n", "4 [(i, 3), (a, 3), (the, 2), (is, 2), (by, 2), (... \n", ".. ... \n", "41 [(pizza, 2), (was, 2), (you, 2), (would, 2), (... \n", "42 [(i, 2), (went, 2), (of, 2), (after, 1), (shop... \n", "43 [(the, 9), (i, 6), (and, 6), (to, 4), (a, 2), ... \n", "44 [(the, 9), (to, 7), (plate, 6), (and, 5), (my,... \n", "45 [(the, 3), (olive, 2), (oil, 2), (garden, 2), ... \n", "\n", " topwords_fil \\\n", "0 [] \n", "1 [(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ... \n", "2 [(pepper, 3), (veggie, 2), (sandwich, 2), (red... \n", "3 [] \n", "4 [(asian, 1), (restaurant, 1), (new, 1), (york,... \n", ".. ... \n", "41 [(pizza, 2), (would, 2), (mikes, 1), (high, 1)... \n", "42 [(went, 2), (shopping, 1), (friend, 1), (dodo,... \n", "43 [(waitress, 2), (waited, 2), (even, 2), (food,... \n", "44 [(plate, 6), (southern, 3), (comfort, 3), (ext... \n", "45 [(olive, 2), (oil, 2), (garden, 2), (good, 2),... \n", "\n", " freq_dist \\\n", "0 {} \n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... \n", "3 {} \n", "4 {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... 
\n", ".. ... \n", "41 {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... \n", "42 {'went': 2, 'shopping': 1, 'friend': 1, 'dodo'... \n", "43 {'entered': 1, 'restaurant': 1, 'waitress': 2,... \n", "44 {'carlos': 1, 'plate': 6, 'shack': 1, 'worst':... \n", "45 {'olive': 2, 'oil': 2, 'garden': 2, 'disappoin... \n", "\n", " freq_dist_unfil \n", "0 {} \n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... \n", "3 {} \n", "4 {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... \n", ".. ... \n", "41 {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... \n", "42 {'after': 1, 'i': 2, 'went': 2, 'shopping': 1,... \n", "43 {'i': 6, 'entered': 1, 'the': 9, 'restaurant':... \n", "44 {'carlos': 1, 'plate': 6, 'shack': 1, 'was': 3... \n", "45 {'olive': 2, 'oil': 2, 'garden': 2, 'was': 2, ... \n", "\n", "[92 rows x 12 columns]" ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 6: Try Different Sentiment Analysis Tools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### VADER" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [], "source": [ "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "sid = SentimentIntensityAnalyzer()\n", "def get_vader_score(review):\n", " return sid.polarity_scores(review)\n", "\n", "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [], "source": [ "def separate_vader_score(vader_score, key):\n", " return vader_score[key]\n", "\n", "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n", "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n", "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n", "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DIY SUMMARY" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "17 Halos is home. 
I have been here numerous times...\n", "17 I went to Joeys and had the best lasagna on th...\n", "Name: 0, dtype: object" ] }, "execution_count": 100, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[0][17]" ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [], "source": [ "def get_weighted_freq_dist(review, freq_dist):\n", " try:\n", " max_freq = max(freq_dist.values())\n", " for word in freq_dist.keys():\n", " freq_dist[word] = (freq_dist[word]/max_freq)\n", " return freq_dist\n", " except:\n", " return 'nope'\n", "\n", "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "def get_sentence_score(review, freq_dist):\n", " sentence_scores = {}\n", " for sent in review:\n", " for word in nltk.word_tokenize(sent.lower()):\n", " if word in freq_dist.keys():\n", " if len(sent.split(' ')) < 30:\n", " if sent not in sentence_scores.keys():\n", " sentence_scores[sent] = freq_dist[word]\n", " else:\n", " sentence_scores[sent] += freq_dist[word]\n", " return sentence_scores\n", "\n", "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)" ] }, { "cell_type": "code", "execution_count": 103, "metadata": {}, "outputs": [], "source": [ "def get_summary_sentences(sentence_scores):\n", " sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n", " return ''.join(sent[0] for sent in sorted_sentences[:5])\n", "\n", "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "summaries = all_df['summary_sentences'].tolist()" ] }, { "cell_type": "code", "execution_count": 105, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "''" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summaries[3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Doing VADER on the Summary Section" ] }, { "cell_type": "code", "execution_count": 106, "metadata": {}, "outputs": [], "source": [ "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n", "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n", "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n", "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Doing VADER on the Most Frequent Words" ] }, { "cell_type": "code", "execution_count": 108, "metadata": {}, "outputs": [], "source": [ "def get_freq_words(freq_dist):\n", " sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n", " return ' '.join(word[0] for word in sorted_words[:50])\n", "\n", "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n", "\n", "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n", "all_df['v_compound_fd'] = all_df.apply(lambda x: 
separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n", "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n", "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n", "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 7: Test `Step 6` with Machine Learning!!\n", "### Naive Bayes" ] }, { "cell_type": "code", "execution_count": 109, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "\n", "def get_NB(small_df, labels):\n", " x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n", "\n", " gnb = GaussianNB()\n", " gnb.fit(x_train, y_train)\n", " y_pred = gnb.predict(x_test)\n", " from sklearn import metrics\n", " print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1: Vader Scores (Original)" ] }, { "cell_type": "code", "execution_count": 110, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 2: Vader Scores (from Summary)" ] }, { "cell_type": "code", "execution_count": 111, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6071428571428571\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 3: Vader Scores (original) AND Vader Scores (summary)" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5714285714285714\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 4: Vader Scores (50 most frequent -- filtered -- words)" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6428571428571429\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 5: All `compound` Vader Scores" ] }, { "cell_type": "code", "execution_count": 114, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6071428571428571\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 6: ALL THE NUMBERS!!" 
] }, { "cell_type": "code", "execution_count": 115, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6071428571428571\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 7: Test UNFILTERED most frequent words" ] }, { "cell_type": "code", "execution_count": 116, "metadata": {}, "outputs": [], "source": [ "def get_freq_words(freq_dist):\n", " sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n", " return ' '.join(word[0] for word in sorted_words[:50])\n", "\n", "all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)\n", "\n", "all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)\n", "\n", "all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)\n", "all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)\n", "all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)\n", "all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6071428571428571\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n", " 'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',\n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "code", "execution_count": 118, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5357142857142857\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [], "source": [ "summaries_pos = all_df[all_df['PoN'] == 'P']\n", "summaries_neg = all_df[all_df['PoN'] == 'N']" ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "summaries_pos_list = summaries_pos['summary_sentences'].tolist()\n", "summaries_neg_list = summaries_neg['summary_sentences'].tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 8: Test NLTK: Naive Bayes from HW1" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [], "source": [ "from nltk.classify import NaiveBayesClassifier\n", "from nltk.tokenize import word_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *\n", "\n", "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "def get_nltk_train_test(array, label, num_train):\n", " tokens = [get_tokens(sentence) for sentence in array]\n", " docs = [(sent, label) for sent in tokens]\n", " train_docs = docs[:num_train]\n", " test_docs = docs[num_train:len(array)]\n", 
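"    # NOTE: the split is sequential (no shuffling) -- the first num_train labeled docs go to training, the remainder to testing\n",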
" return [train_docs, test_docs]\n", "\n", "\n", "def get_nltk_NB(NEG_DATA, POS_DATA, num_train):\n", " train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)\n", " train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)\n", "\n", " training_docs = train_neg + train_pos\n", " testing_docs = test_neg + test_pos\n", "\n", " sentim_analyzer = SentimentAnalyzer()\n", " all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n", " unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n", " sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n", " training_set = sentim_analyzer.apply_features(training_docs)\n", " test_set = sentim_analyzer.apply_features(testing_docs)\n", "\n", " trainer = NaiveBayesClassifier.train\n", " classifier = sentim_analyzer.train(trainer, training_set)\n", " \n", " results = []\n", " for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):\n", " print('{0}: {1}'.format(key,value))" ] }, { "cell_type": "code", "execution_count": 136, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training classifier\n", "Evaluating NaiveBayesClassifier results...\n", "Accuracy: 0.5714285714285714\n", "F-measure [neg]: 0.5714285714285714\n", "F-measure [pos]: 0.5714285714285714\n", "Precision [neg]: 0.5714285714285714\n", "Precision [pos]: 0.5714285714285714\n", "Recall [neg]: 0.5714285714285714\n", "Recall [pos]: 0.5714285714285714\n" ] } ], "source": [ "neg_df = all_df[all_df['PoN'] == 'N']\n", "neg_df_list = neg_df[0].tolist()\n", "\n", "pos_df = all_df[all_df['PoN'] == 'P']\n", "pos_df_list = pos_df[0].tolist()\n", "\n", "import math\n", "\n", "percent_train = 0.7 if (len(pos_df) < 200) else 0.8\n", "train_size = math.floor(len(pos_df)*percent_train)\n", "get_nltk_NB(neg_df_list, pos_df_list, train_size)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }