{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW4 [Deception] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: GET THAT DATA" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "import os\n", "def get_data(file, path):\n", " f=open(path+file)\n", " data = f.read()\n", " f.close()\n", " return data\n", " \n", "def get_data_from_files(path):\n", " results = [get_data(file, path) for file in os.listdir(path)]\n", " return results\n", "\n", "# pos = get_data_from_files('../pos_cornell//')\n", "# neg = get_data_from_files('../neg_cornell/')\n", "pos = get_data_from_files('../hw4_lie_false/')\n", "neg = get_data_from_files('../hw4_lie_true/')" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0?N
1Twin Trees Cicero NY HUGE salad bar and high q...N
2The worst restaurant that I have ever eaten in...N
3?N
4I have been to a Asian restaurant in New York ...N
.........
87Mikes Pizza High Point NY Service was very slo...P
88After I went shopping with some of my friend w...P
89I entered the restaurant and a waitress came b...P
90Carlos Plate Shack was the worst dining experi...P
91Olive Oil Garden was very disappointing. I exp...P
\n", "

92 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 PoN\n", "0 ? N\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N\n", "2 The worst restaurant that I have ever eaten in... N\n", "3 ? N\n", "4 I have been to a Asian restaurant in New York ... N\n", ".. ... ..\n", "87 Mikes Pizza High Point NY Service was very slo... P\n", "88 After I went shopping with some of my friend w... P\n", "89 I entered the restaurant and a waitress came b... P\n", "90 Carlos Plate Shack was the worst dining experi... P\n", "91 Olive Oil Garden was very disappointing. I exp... P\n", "\n", "[92 rows x 2 columns]" ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "neg_df = pd.DataFrame(neg)\n", "pos_df = pd.DataFrame(pos)\n", "pos_df['PoN'] = 'P'\n", "neg_df['PoN'] = 'N'\n", "all_df = neg_df.append(pos_df)\n", "all_df.reset_index(drop=True,inplace=True)\n", "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: TOKENIZE" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2a by sentence" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [], "source": [ "def get_sentence_tokens(review):\n", " return sent_tokenize(review)\n", " \n", "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n", "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2b by word" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [], "source": [ "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n", "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 177, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
0?N[?]1[]0
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 " ] }, "execution_count": 177, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: EXPERIMENT\n", "#### Experiment with: stopwords, stemming, lemming etc." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3a remove english stopwords" ] }, { "cell_type": "code", "execution_count": 178, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def remove_stopwords(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n", "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 179, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_sw
0?N[?]1[]0[]0
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "\n", " no_sw num_no_sw \n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 " ] }, "execution_count": 179, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3b get stems for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 180, "metadata": {}, "outputs": [], "source": [ "from nltk.stem import PorterStemmer\n", "def get_stems(sentence):\n", " ps = PorterStemmer()\n", " return [ps.stem(w) for w in sentence]\n", " \n", "all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)\n", "all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw
0?N[?]1[]0[]0[][]
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "\n", " no_sw num_no_sw \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "\n", " stemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "\n", " stemmed_no_sw \n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... " ] }, "execution_count": 181, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3c get lemmas for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 182, "metadata": {}, "outputs": [], "source": [ "from nltk.stem.wordnet import WordNetLemmatizer\n", "def get_lemmas(sentence):\n", " lem = WordNetLemmatizer() \n", " return [lem.lemmatize(w) for w in sentence]\n", " \n", "all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)\n", "all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_sw
0?N[?]1[]0[]0[][][][]
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "\n", " no_sw num_no_sw \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "\n", " stemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "\n", " stemmed_no_sw \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "\n", " lemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "\n", " lemmed_no_sw \n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... " ] }, "execution_count": 183, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)\n", "all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_sw
0?N[?]1[]0[]0[][][][][][]{}{}
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...[(the, DT), (worst, JJS), (restaurant, NN), (t...[(worst, RBS), (restaurant, NN), (ever, RB), (...{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "\n", " no_sw num_no_sw \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "\n", " stemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "\n", " stemmed_no_sw \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "\n", " lemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "\n", " lemmed_no_sw \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... \n", "\n", " pos \\\n", "0 [] \n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t... \n", "\n", " pos_no_sw \\\n", "0 [] \n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(worst, RBS), (restaurant, NN), (ever, RB), (... \n", "\n", " pos_dict \\\n", "0 {} \n", "1 {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... \n", "2 {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... \n", "\n", " pos_dict_no_sw \n", "0 {} \n", "1 {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... \n", "2 {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... " ] }, "execution_count": 185, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_pos_dict(pos_tuple):\n", " pos_dict = {}\n", " for t in pos_tuple:\n", " if t[1] in pos_dict.keys():\n", " pos_dict[t[1]] += 1\n", " else:\n", " pos_dict.update({t[1]: 1})\n", " return pos_dict\n", "\n", "all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)\n", "all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)\n", "all_df[:3]" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_swbowbow_no_sw
0?N[?]1[]0[]0[][][][][][]{}{}{}{}
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...[(the, DT), (worst, JJS), (restaurant, NN), (t...[(worst, RBS), (restaurant, NN), (ever, RB), (...{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...{'the': 6, 'worst': 1, 'restaurant': 1, 'that'...{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "\n", " no_sw num_no_sw \\\n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "\n", " stemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "\n", " stemmed_no_sw \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "\n", " lemmed \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "\n", " lemmed_no_sw \\\n", "0 [] \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... \n", "\n", " pos \\\n", "0 [] \n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t... \n", "\n", " pos_no_sw \\\n", "0 [] \n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(worst, RBS), (restaurant, NN), (ever, RB), (... \n", "\n", " pos_dict \\\n", "0 {} \n", "1 {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... \n", "2 {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... \n", "\n", " pos_dict_no_sw \\\n", "0 {} \n", "1 {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... \n", "2 {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... \n", "\n", " bow \\\n", "0 {} \n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... \n", "\n", " bow_no_sw \n", "0 {} \n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... " ] }, "execution_count": 198, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_bow_from_tokens(df, column):\n", " all_column_data = ' '.join(df[column].tolist())\n", " all_column_fd = Counter(all_column_data.split())\n", " return all_column_fd\n", "\n", "# bow = get_bow_from_column(all_df, 'diy_cleaner')\n", "# bow =\n", "from collections import Counter\n", "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n", "all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n", "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: TEST EXPERIMENTS!!" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "\n", "def get_NB(small_df, labels):\n", " x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n", "\n", " gnb = GaussianNB()\n", " gnb.fit(x_train, y_train)\n", " y_pred = gnb.predict(x_test)\n", " from sklearn import metrics\n", " print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5714285714285714\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...VBGEXJJRPDTRPWPCDRBRMDRBS
PoN
N0000000000...0000000000
N11339324443...0000000000
N291175114844...1000000000
N0000000000...0000000000
N13225125001...0000000000
\n", "

5 rows × 28 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP \\\n", "PoN ... \n", "N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 \n", "N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 \n", "N 29 1 1 7 5 1 14 8 4 4 ... 1 0 0 0 0 \n", "N 0 0 0 0 0 0 0 0 0 0 ... 0 0 0 0 0 \n", "N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 \n", "\n", " WP CD RBR MD RBS \n", "PoN \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "\n", "[5 rows x 28 columns]" ] }, "execution_count": 204, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])\n", "new_df = new_df.fillna(0).astype(int)\n", "get_NB(new_df, new_df.index)\n", "new_df[:5]" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [], "source": [ "# def normalize_df(df):\n", "# df[\"total\"] = df.sum(axis = 1)\n", "# df = df.apply(lambda row: row/row[\"total\"], axis = 1)\n", "# df.drop(\"total\", axis=1, inplace = True)\n", "# return(df)\n", "\n", "def normalize_df(df):\n", " names = df.columns\n", " df[\"total\"] = df.sum(axis = 1)\n", " for name in names:\n", " df[name] = df[name]/df[\"total\"]\n", " df.drop(\"total\", axis =1 , inplace = True)\n", " return(df)" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...VBGEXJJRPDTRPWPCDRBRMDRBS
PoN
NNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
N0.2075470.0566040.0566040.1698110.0566040.0377360.0754720.0754720.0754720.056604...0.0000000.00.00.00.0000000.00.0000000.00.0000000.000000
N0.2761900.0095240.0095240.0666670.0476190.0095240.1333330.0761900.0380950.038095...0.0095240.00.00.00.0000000.00.0000000.00.0000000.000000
NNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
N0.2888890.0444440.0444440.1111110.0222220.0444440.1111110.0000000.0000000.022222...0.0000000.00.00.00.0000000.00.0000000.00.0000000.000000
..................................................................
P0.1627910.0465120.0232560.1395350.0465120.0000000.0465120.0697670.0930230.046512...0.0232560.00.00.00.0000000.00.0000000.00.0697670.000000
P0.2083330.0416670.0000000.0416670.0000000.0000000.0833330.0000000.0416670.041667...0.0416670.00.00.00.0000000.00.0416670.00.0000000.000000
P0.1919190.0101010.0000000.0707070.0707070.0101010.1414140.1010100.0707070.040404...0.0101010.00.00.00.0000000.00.0101010.00.0101010.020202
P0.2064520.0451610.0193550.0903230.0451610.0000000.0967740.0709680.0451610.045161...0.0129030.00.00.00.0258060.00.0000000.00.0000000.000000
P0.2325580.0232560.0465120.1395350.0465120.0000000.0930230.0697670.0465120.046512...0.0000000.00.00.00.0232560.00.0000000.00.0000000.000000
\n", "

92 rows × 28 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT \\\n", "PoN \n", "N NaN NaN NaN NaN NaN NaN NaN \n", "N 0.207547 0.056604 0.056604 0.169811 0.056604 0.037736 0.075472 \n", "N 0.276190 0.009524 0.009524 0.066667 0.047619 0.009524 0.133333 \n", "N NaN NaN NaN NaN NaN NaN NaN \n", "N 0.288889 0.044444 0.044444 0.111111 0.022222 0.044444 0.111111 \n", ".. ... ... ... ... ... ... ... \n", "P 0.162791 0.046512 0.023256 0.139535 0.046512 0.000000 0.046512 \n", "P 0.208333 0.041667 0.000000 0.041667 0.000000 0.000000 0.083333 \n", "P 0.191919 0.010101 0.000000 0.070707 0.070707 0.010101 0.141414 \n", "P 0.206452 0.045161 0.019355 0.090323 0.045161 0.000000 0.096774 \n", "P 0.232558 0.023256 0.046512 0.139535 0.046512 0.000000 0.093023 \n", "\n", " RB VB TO ... VBG EX JJR PDT RP \\\n", "PoN ... \n", "N NaN NaN NaN ... NaN NaN NaN NaN NaN \n", "N 0.075472 0.075472 0.056604 ... 0.000000 0.0 0.0 0.0 0.000000 \n", "N 0.076190 0.038095 0.038095 ... 0.009524 0.0 0.0 0.0 0.000000 \n", "N NaN NaN NaN ... NaN NaN NaN NaN NaN \n", "N 0.000000 0.000000 0.022222 ... 0.000000 0.0 0.0 0.0 0.000000 \n", ".. ... ... ... ... ... ... ... ... ... \n", "P 0.069767 0.093023 0.046512 ... 0.023256 0.0 0.0 0.0 0.000000 \n", "P 0.000000 0.041667 0.041667 ... 0.041667 0.0 0.0 0.0 0.000000 \n", "P 0.101010 0.070707 0.040404 ... 0.010101 0.0 0.0 0.0 0.000000 \n", "P 0.070968 0.045161 0.045161 ... 0.012903 0.0 0.0 0.0 0.025806 \n", "P 0.069767 0.046512 0.046512 ... 0.000000 0.0 0.0 0.0 0.023256 \n", "\n", " WP CD RBR MD RBS \n", "PoN \n", "N NaN NaN NaN NaN NaN \n", "N 0.0 0.000000 0.0 0.000000 0.000000 \n", "N 0.0 0.000000 0.0 0.000000 0.000000 \n", "N NaN NaN NaN NaN NaN \n", "N 0.0 0.000000 0.0 0.000000 0.000000 \n", ".. ... ... ... ... ... \n", "P 0.0 0.000000 0.0 0.069767 0.000000 \n", "P 0.0 0.041667 0.0 0.000000 0.000000 \n", "P 0.0 0.010101 0.0 0.010101 0.020202 \n", "P 0.0 0.000000 0.0 0.000000 0.000000 \n", "P 0.0 0.000000 0.0 0.000000 0.000000 \n", "\n", "[92 rows x 28 columns]" ] }, "execution_count": 222, "metadata": {}, "output_type": "execute_result" } ], "source": [ "norm_df = normalize_df(new_df)\n", "\n", "# new_df['total'] = new_df.sum(axis = 1)\n", "# new_df_norm = new_df.copy()\n", "# new_df_norm = new_df_norm.apply(lambda x: x/x['total'], axis=1)\n", "\n", "# new_df_norm = new_df_norm.drop('total', axis=1)\n", "# norm_df = norm_df.fillna(0).astype(int)\n", "# get_NB(new_df_norm, new_df_norm.index)\n", "# new_df_norm[:5]\n", "norm_df\n", "# new_df" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5\n" ] } ], "source": [ "new_df = pd.DataFrame(all_df['bow_no_sw'].tolist(), all_df['PoN'])\n", "new_df = new_df.fillna(0).astype(int)\n", "new_df[:5]\n", "get_NB(new_df, new_df.index)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }