{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW4 [Deception] " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: GET THAT DATA" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "\n",
 "\n",
 "def get_data(file, path):\n",
 "    \"\"\"Return the full text of the file named `file` inside directory `path`.\"\"\"\n",
 "    # os.path.join accepts `path` with or without a trailing separator, and the\n",
 "    # with-block closes the handle even if read() raises.\n",
 "    with open(os.path.join(path, file)) as f:\n",
 "        return f.read()\n",
 "\n",
 "\n",
 "def get_data_from_files(path):\n",
 "    \"\"\"Read every entry that os.listdir reports for `path`; return the texts as a list.\"\"\"\n",
 "    return [get_data(file, path) for file in os.listdir(path)]\n",
 "\n",
 "\n",
 "# Alternative input directories:\n",
 "# pos = get_data_from_files('../pos_cornell//')\n",
 "# neg = get_data_from_files('../neg_cornell/')\n",
 "\n",
 "pos = get_data_from_files('../hw4_lie_false/')\n",
 "neg = get_data_from_files('../hw4_lie_true/')"
 ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0?N
1Twin Trees Cicero NY HUGE salad bar and high q...N
2The worst restaurant that I have ever eaten in...N
3?N
4I have been to a Asian restaurant in New York ...N
\n", "
" ], "text/plain": [ " 0 PoN\n", "0 ? N\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N\n", "2 The worst restaurant that I have ever eaten in... N\n", "3 ? N\n", "4 I have been to a Asian restaurant in New York ... N" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "import pandas as pd\n",
 "\n",
 "neg_df = pd.DataFrame(neg)\n",
 "pos_df = pd.DataFrame(pos)\n",
 "pos_df['PoN'] = 'P'\n",
 "neg_df['PoN'] = 'N'\n",
 "# DataFrame.append was removed in pandas 2.0. pd.concat with ignore_index=True\n",
 "# stacks both frames and renumbers the rows 0..n-1 in a single step.\n",
 "all_df = pd.concat([neg_df, pos_df], ignore_index=True)\n",
 "all_df[:5]"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: TOKENIZE" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [
 "# nltk.pos_tag is called in a later cell, so import the package by name\n",
 "# instead of relying on whatever the star import below happens to expose.\n",
 "import nltk\n",
 "from nltk.tokenize import word_tokenize, sent_tokenize\n",
 "from nltk.sentiment import SentimentAnalyzer\n",
 "from nltk.sentiment.util import *"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2a by sentence" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [
 "def get_sentence_tokens(review):\n",
 "    \"\"\"Split one review string into a list of sentence strings.\"\"\"\n",
 "    return sent_tokenize(review)\n",
 "\n",
 "\n",
 "# Column 0 holds the raw review text.\n",
 "all_df['sentences'] = all_df.apply(lambda row: get_sentence_tokens(row[0]), axis=1)\n",
 "all_df['num_sentences'] = all_df.apply(lambda row: len(row['sentences']), axis=1)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2b by word" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [
 "def get_tokens(sentence):\n",
 "    \"\"\"Word-tokenize `sentence`, keep only purely alphabetic tokens, and lowercase them.\"\"\"\n",
 "    return [word.lower() for word in word_tokenize(sentence) if word.isalpha()]\n",
 "\n",
 "\n",
 "all_df['tokens'] = all_df.apply(lambda row: get_tokens(row[0]), axis=1)\n",
 "all_df['num_tokens'] = all_df.apply(lambda row: len(row['tokens']), axis=1)"
 ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
0?N[?]1[]0
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 ? N \n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "\n", " sentences num_sentences \\\n", "0 [?] 1 \n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "\n", " tokens num_tokens \n", "0 [] 0 \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Peek at the first three rows with the new sentence/token columns.\n",
 "all_df.head(3)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2c Remove if tokens < 1" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 " ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "# Drop the rows whose review produced no alphabetic tokens at all.\n",
 "all_df = all_df.drop(index=all_df.index[all_df['num_tokens'] < 1])\n",
 "all_df.head(3)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: EXPERIMENT\n", "#### Experiment with: stopwords, stemming, lemming etc." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3a remove english stopwords" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [
 "from nltk.corpus import stopwords\n",
 "\n",
 "# A set gives constant-time membership checks in the filter below.\n",
 "stop_words = set(stopwords.words(\"english\"))\n",
 "\n",
 "\n",
 "def remove_stopwords(sentence):\n",
 "    \"\"\"Return the items of the token list `sentence` that are not in `stop_words`, in order.\"\"\"\n",
 "    return [word for word in sentence if word not in stop_words]\n",
 "\n",
 "\n",
 "all_df['no_sw'] = all_df.apply(lambda row: remove_stopwords(row['tokens']), axis=1)\n",
 "all_df['num_no_sw'] = all_df.apply(lambda row: len(row['no_sw']), axis=1)"
 ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_sw
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "\n", " no_sw num_no_sw \n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "all_df.head(3)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3b get stems for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [
 "from nltk.stem import PorterStemmer\n",
 "\n",
 "\n",
 "def get_stems(sentence):\n",
 "    \"\"\"Return the Porter stem of every token in `sentence`, in the same order.\"\"\"\n",
 "    stem = PorterStemmer().stem\n",
 "    return list(map(stem, sentence))\n",
 "\n",
 "\n",
 "all_df['stemmed'] = all_df.apply(lambda row: get_stems(row['tokens']), axis=1)\n",
 "all_df['stemmed_no_sw'] = all_df.apply(lambda row: get_stems(row['no_sw']), axis=1)"
 ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "\n", " stemmed_no_sw \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "4 [asian, restaur, new, york, citi, menu, writte... 
" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "all_df.head(3)"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3c get lemmas for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [
 "from nltk.stem.wordnet import WordNetLemmatizer\n",
 "\n",
 "\n",
 "def get_lemmas(sentence):\n",
 "    \"\"\"Return the WordNet lemma of every token in `sentence`, in the same order.\"\"\"\n",
 "    lemmatize = WordNetLemmatizer().lemmatize\n",
 "    return list(map(lemmatize, sentence))\n",
 "\n",
 "\n",
 "all_df['lemmed'] = all_df.apply(lambda row: get_lemmas(row['tokens']), axis=1)\n",
 "all_df['lemmed_no_sw'] = all_df.apply(lambda row: get_lemmas(row['no_sw']), axis=1)"
 ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_sw
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte...[i, have, been, to, a, asian, restaurant, in, ...[asian, restaurant, new, york, city, menu, wri...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "\n", " stemmed_no_sw \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "4 [asian, restaur, new, york, citi, menu, writte... \n", "\n", " lemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "4 [i, have, been, to, a, asian, restaurant, in, ... \n", "\n", " lemmed_no_sw \n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... \n", "4 [asian, restaurant, new, york, city, menu, wri... 
" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "all_df[:3]"
 ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [
 "# No earlier cell imports nltk by name, so do it here: a fresh-kernel run of\n",
 "# this cell must not depend on a star import leaking the package into scope.\n",
 "import nltk\n",
 "\n",
 "# Part-of-speech tag each token list; every row becomes a list of (token, tag) pairs.\n",
 "all_df['pos'] = all_df.apply(lambda row: nltk.pos_tag(row['tokens']), axis=1)\n",
 "all_df['pos_no_sw'] = all_df.apply(lambda row: nltk.pos_tag(row['no_sw']), axis=1)"
 ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_sw
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...[(the, DT), (worst, JJS), (restaurant, NN), (t...[(worst, RBS), (restaurant, NN), (ever, RB), (...{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte...[i, have, been, to, a, asian, restaurant, in, ...[asian, restaurant, new, york, city, menu, wri...[(i, NNS), (have, VBP), (been, VBN), (to, TO),...[(asian, JJ), (restaurant, NN), (new, JJ), (yo...{'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ...{'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "\n", " stemmed_no_sw \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "4 [asian, restaur, new, york, citi, menu, writte... \n", "\n", " lemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "4 [i, have, been, to, a, asian, restaurant, in, ... \n", "\n", " lemmed_no_sw \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... \n", "4 [asian, restaurant, new, york, city, menu, wri... \n", "\n", " pos \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t... \n", "4 [(i, NNS), (have, VBP), (been, VBN), (to, TO),... \n", "\n", " pos_no_sw \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(worst, RBS), (restaurant, NN), (ever, RB), (... 
\n", "4 [(asian, JJ), (restaurant, NN), (new, JJ), (yo... \n", "\n", " pos_dict \\\n", "1 {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... \n", "2 {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... \n", "4 {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... \n", "\n", " pos_dict_no_sw \n", "1 {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... \n", "2 {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... \n", "4 {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':... " ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "def get_pos_dict(pos_tuple):\n",
 "    \"\"\"Count how often each tag (element 1 of every pair) occurs in `pos_tuple`.\n",
 "\n",
 "    Returns a plain dict mapping tag -> count, keyed in first-seen order.\n",
 "    \"\"\"\n",
 "    tag_counts = {}\n",
 "    for pair in pos_tuple:\n",
 "        tag = pair[1]\n",
 "        tag_counts[tag] = tag_counts.get(tag, 0) + 1\n",
 "    return tag_counts\n",
 "\n",
 "\n",
 "all_df['pos_dict'] = all_df.apply(lambda row: get_pos_dict(row['pos']), axis=1)\n",
 "all_df['pos_dict_no_sw'] = all_df.apply(lambda row: get_pos_dict(row['pos_no_sw']), axis=1)\n",
 "all_df.head(3)"
 ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_swbowbow_no_sw
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla...[the, worst, restaurant, that, i, have, ever, ...[worst, restaurant, ever, eaten, undoubtedly, ...[(the, DT), (worst, JJS), (restaurant, NN), (t...[(worst, RBS), (restaurant, NN), (ever, RB), (...{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...{'the': 6, 'worst': 1, 'restaurant': 1, 'that'...{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte...[i, have, been, to, a, asian, restaurant, in, ...[asian, restaurant, new, york, city, menu, wri...[(i, NNS), (have, VBP), (been, VBN), (to, TO),...[(asian, JJ), (restaurant, NN), (new, JJ), (yo...{'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ...{'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':...{'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3...{'asian': 1, 'restaurant': 1, 'new': 1, 'york'...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "\n", " stemmed_no_sw \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... \n", "4 [asian, restaur, new, york, citi, menu, writte... \n", "\n", " lemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaurant, that, i, have, ever, ... \n", "4 [i, have, been, to, a, asian, restaurant, in, ... \n", "\n", " lemmed_no_sw \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... \n", "4 [asian, restaurant, new, york, city, menu, wri... \n", "\n", " pos \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t... \n", "4 [(i, NNS), (have, VBP), (been, VBN), (to, TO),... \n", "\n", " pos_no_sw \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(worst, RBS), (restaurant, NN), (ever, RB), (... 
\n", "4 [(asian, JJ), (restaurant, NN), (new, JJ), (yo... \n", "\n", " pos_dict \\\n", "1 {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... \n", "2 {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... \n", "4 {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... \n", "\n", " pos_dict_no_sw \\\n", "1 {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... \n", "2 {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... \n", "4 {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':... \n", "\n", " bow \\\n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... \n", "4 {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... \n", "\n", " bow_no_sw \n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... \n", "4 {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... " ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [
 "from collections import Counter\n",
 "\n",
 "# Per-review bag of words: a Counter mapping each token to its count in that review.\n",
 "all_df['bow'] = all_df.apply(lambda row: Counter(row['tokens']), axis=1)\n",
 "all_df['bow_no_sw'] = all_df.apply(lambda row: Counter(row['no_sw']), axis=1)\n",
 "all_df[:3]"
 ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [
 "# Split the reviews by label.\n",
 "all_df_n = all_df[all_df['PoN'] == 'N']\n",
 "all_df_p = all_df[all_df['PoN'] == 'P']\n",
 "\n",
 "# Iterating a Counter yields each distinct word once, so these lists hold one\n",
 "# entry per (review, distinct word) pair. The totals built from them are the\n",
 "# number of reviews containing a word, not its total number of occurrences.\n",
 "big_bow = [word for bow in all_df['bow'].tolist() for word in bow]\n",
 "big_bow_n = [word for bow in all_df_n['bow'].tolist() for word in bow]\n",
 "big_bow_p = [word for bow in all_df_p['bow'].tolist() for word in bow]\n",
 "\n",
 "df = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index': 'word', 0: 'count'})\n",
 ")\n",
 "\n",
 "df_n = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow_n), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index': 'word', 0: 'count'})\n",
 ")\n",
 "\n",
 "df_p = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow_p), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index': 'word', 0: 'count'})\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] } ], "source": [
 "import seaborn as sns\n",
 "import matplotlib.pyplot as plt\n",
 "\n",
 "\n",
 "def bar_plot(df, title):\n",
 "    \"\"\"Draw a bar chart of `count` per `word` from `df` and return the pyplot module.\n",
 "\n",
 "    df    -- frame with \"word\" and \"count\" columns\n",
 "    title -- text shown above the chart\n",
 "    \"\"\"\n",
 "    # Set the context before anything is drawn. It was previously applied\n",
 "    # after the bars and labels already existed, so it only took effect on\n",
 "    # the next figure and the first chart looked different from the rest.\n",
 "    sns.set_context(\"talk\")\n",
 "    sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n",
 "    plt.title(title)\n",
 "    plt.xlabel(\"Word\")\n",
 "    plt.ylabel(\"Count\")\n",
 "    plt.xticks(rotation = 90)\n",
 "    return plt\n",
 "\n",
 "\n",
 "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (ALL) Prior to Cleaning\"))"
 ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "# Twenty highest-count rows of the 'N' subset.\n",
 "print(bar_plot(\n",
 "    df_n.sort_values(by=\"count\", ascending=False).head(20),\n",
 "    \"Top 20 Items (TRUE) Prior to Cleaning\",\n",
 "))"
 ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [
 "# Twenty highest-count rows of the 'P' subset.\n",
 "print(bar_plot(\n",
 "    df_p.sort_values(by=\"count\", ascending=False).head(20),\n",
 "    \"Top 20 Items (FALSE) Prior to Cleaning\",\n",
 "))"
 ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [
 "# Same count frames as before, rebuilt from the stopword-free bags of words.\n",
 "all_df_n = all_df[all_df['PoN'] == 'N']\n",
 "all_df_p = all_df[all_df['PoN'] == 'P']\n",
 "\n",
 "# Iterating a Counter yields each distinct word once per review.\n",
 "big_bow = [word for bow in all_df['bow_no_sw'].tolist() for word in bow]\n",
 "big_bow_n = [word for bow in all_df_n['bow_no_sw'].tolist() for word in bow]\n",
 "big_bow_p = [word for bow in all_df_p['bow_no_sw'].tolist() for word in bow]\n",
 "\n",
 "df = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index':'word', 0:'count'})\n",
 ")\n",
 "\n",
 "df_n = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow_n), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index':'word', 0:'count'})\n",
 ")\n",
 "\n",
 "df_p = (\n",
 "    pd.DataFrame.from_dict(Counter(big_bow_p), orient='index')\n",
 "    .reset_index()\n",
 "    .rename(columns={'index':'word', 0:'count'})\n",
 ")"
 ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (ALL) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (TRUE) Stopwords Removed\"))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (FALSE) Stopwords Removed\"))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "\n", "big_bow = [item for review in all_df['pos_dict'].tolist() for item in review]\n", "big_bow_n = [item for review in all_df_n['pos_dict'].tolist() for item in review]\n", "big_bow_p = [item for review in all_df_p['pos_dict'].tolist() for item in review]\n", "\n", "df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()\n", "df = df.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()\n", "df_n = df_n.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()\n", "df_p = df_p.rename(columns={'index':'word', 0:'count'})" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 Items (ALL) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS (TRUE) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS (FALSE) Prior to Cleaning\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: TEST EXPERIMENTS!!" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "\n", "def get_NB(small_df, labels):\n", " x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n", "\n", " gnb = GaussianNB()\n", " gnb.fit(x_train, y_train)\n", " y_pred = gnb.predict(x_test)\n", " from sklearn import metrics\n", " print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1: Parts of speech frequency distribution" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...VBGEXJJRPDTRPWPCDRBRMDRBS
PoN
N11.03.03.09.03.02.04.04.04.03.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
N29.01.01.07.05.01.014.08.04.04.0...1.0NaNNaNNaNNaNNaNNaNNaNNaNNaN
N13.02.02.05.01.02.05.0NaNNaN1.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

3 rows × 28 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR \\\n", "PoN ... \n", "N 11.0 3.0 3.0 9.0 3.0 2.0 4.0 4.0 4.0 3.0 ... NaN NaN NaN \n", "N 29.0 1.0 1.0 7.0 5.0 1.0 14.0 8.0 4.0 4.0 ... 1.0 NaN NaN \n", "N 13.0 2.0 2.0 5.0 1.0 2.0 5.0 NaN NaN 1.0 ... NaN NaN NaN \n", "\n", " PDT RP WP CD RBR MD RBS \n", "PoN \n", "N NaN NaN NaN NaN NaN NaN NaN \n", "N NaN NaN NaN NaN NaN NaN NaN \n", "N NaN NaN NaN NaN NaN NaN NaN \n", "\n", "[3 rows x 28 columns]" ] }, "execution_count": 31, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])\n", "pos_df[:3]" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...VBGEXJJRPDTRPWPCDRBRMDRBS
PoN
N11339324443...0000000000
N291175114844...1000000000
N13225125001...0000000000
\n", "

3 rows × 28 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT RB VB TO ... VBG EX JJR PDT RP \\\n", "PoN ... \n", "N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 \n", "N 29 1 1 7 5 1 14 8 4 4 ... 1 0 0 0 0 \n", "N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 \n", "\n", " WP CD RBR MD RBS \n", "PoN \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "\n", "[3 rows x 28 columns]" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df = pos_df.fillna(0).astype(int)\n", "pos_df[:3]" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5925925925925926\n" ] } ], "source": [ "get_NB(pos_df, pos_df.index)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1b: Normalized parts of speech frequency distribution" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...EXJJRPDTRPWPCDRBRMDRBStotal
PoN
N11339324443...00000000053
N291175114844...000000000105
N13225125001...00000000045
\n", "

3 rows × 29 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT RB VB TO ... EX JJR PDT RP WP CD \\\n", "PoN ... \n", "N 11 3 3 9 3 2 4 4 4 3 ... 0 0 0 0 0 0 \n", "N 29 1 1 7 5 1 14 8 4 4 ... 0 0 0 0 0 0 \n", "N 13 2 2 5 1 2 5 0 0 1 ... 0 0 0 0 0 0 \n", "\n", " RBR MD RBS total \n", "PoN \n", "N 0 0 0 53 \n", "N 0 0 0 105 \n", "N 0 0 0 45 \n", "\n", "[3 rows x 29 columns]" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df_norm = pos_df.copy()\n", "pos_df_norm = pos_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "pos_df_norm[:3]\n", "pos_df_norm[1:]\n", "test = pos_df.copy()\n", "test['total'] = test.sum(axis = 1)\n", "test[:3]" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNNNSVBPJJCCVBZDTRBVBTO...VBGEXJJRPDTRPWPCDRBRMDRBS
PoN
N0.2075470.0566040.0566040.1698110.0566040.0377360.0754720.0754720.0754720.056604...0.0000000.00.00.00.00.00.00.00.00.0
N0.2761900.0095240.0095240.0666670.0476190.0095240.1333330.0761900.0380950.038095...0.0095240.00.00.00.00.00.00.00.00.0
N0.2888890.0444440.0444440.1111110.0222220.0444440.1111110.0000000.0000000.022222...0.0000000.00.00.00.00.00.00.00.00.0
\n", "

3 rows × 28 columns

\n", "
" ], "text/plain": [ " NN NNS VBP JJ CC VBZ DT \\\n", "PoN \n", "N 0.207547 0.056604 0.056604 0.169811 0.056604 0.037736 0.075472 \n", "N 0.276190 0.009524 0.009524 0.066667 0.047619 0.009524 0.133333 \n", "N 0.288889 0.044444 0.044444 0.111111 0.022222 0.044444 0.111111 \n", "\n", " RB VB TO ... VBG EX JJR PDT RP WP \\\n", "PoN ... \n", "N 0.075472 0.075472 0.056604 ... 0.000000 0.0 0.0 0.0 0.0 0.0 \n", "N 0.076190 0.038095 0.038095 ... 0.009524 0.0 0.0 0.0 0.0 0.0 \n", "N 0.000000 0.000000 0.022222 ... 0.000000 0.0 0.0 0.0 0.0 0.0 \n", "\n", " CD RBR MD RBS \n", "PoN \n", "N 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 \n", "\n", "[3 rows x 28 columns]" ] }, "execution_count": 35, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df_norm[:3]" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5925925925925926\n" ] } ], "source": [ "get_NB(pos_df_norm, pos_df.index)" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.4444444444444444\n" ] } ], "source": [ "# small_df\n", "small_df = pos_df_norm.filter(['PRP', 'PRP$','NN'])\n", "get_NB(small_df, pos_df.index)" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "337\n", "160\n", "177\n", "0.47477744807121663\n", "0.5252225519287834\n" ] } ], "source": [ "pos_df_n = pos_df[pos_df.index == 'N']\n", "pos_df_p = pos_df[pos_df.index == 'P']\n", "print(pos_df['PRP'].sum())\n", "print(pos_df_n['PRP'].sum())\n", "print(pos_df_p['PRP'].sum())\n", "print(pos_df_n['PRP'].sum()/pos_df['PRP'].sum())\n", "print(pos_df_p['PRP'].sum()/pos_df['PRP'].sum())" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "138\n", "65\n", "73\n" ] } ], 
"source": [ "pos_df_n = pos_df[pos_df.index == 'N']\n", "pos_df_p = pos_df[pos_df.index == 'P']\n", "print(pos_df['PRP$'].sum())\n", "print(pos_df_n['PRP$'].sum())\n", "print(pos_df_p['PRP$'].sum())" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.256356712105416\n", "2.060598739935355\n", "2.19575797217006\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP'].sum())\n", "print(pos_df_n['PRP'].sum())\n", "print(pos_df_p['PRP'].sum())" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.04729285235672684\n", "0.04683178954398534\n", "0.047733868960218695\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP'].mean())\n", "print(pos_df_n['PRP'].mean())\n", "print(pos_df_p['PRP'].mean())" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.0177106769174579\n", "0.017530735194787515\n", "0.0178827950869687\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP$'].mean())\n", "print(pos_df_n['PRP$'].mean())\n", "print(pos_df_p['PRP$'].mean())" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...\n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t...\n", "4 [(i, NNS), (have, VBP), (been, VBN), (to, TO),...\n", "5 [(the, DT), (best, JJS), (restaurant, NN), (i,...\n", "6 [(the, DT), (restaurant, NN), (looked, VBD), (...\n", " ... 
\n", "87 [(mikes, NNS), (pizza, VBP), (high, JJ), (poin...\n", "88 [(after, IN), (i, JJ), (went, VBD), (shopping,...\n", "89 [(i, NN), (entered, VBD), (the, DT), (restaura...\n", "90 [(carlos, NN), (plate, NN), (shack, NN), (was,...\n", "91 [(olive, JJ), (oil, NN), (garden, NN), (was, V...\n", "Name: pos, Length: 90, dtype: object" ] }, "execution_count": 43, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['pos']" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "all_df['pos_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos']], axis=1)\n", "all_df['pos_sent_str'] = all_df.apply(lambda x: [' '.join(x['pos_sent'])], axis=1)\n", "all_df['pos_no_sw_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos_no_sw']], axis=1)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(all_df['pos_sent_str'][1])" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "all_df['pos_sent_bi'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])], axis=1)\n", "# bigrams = [b for l in text for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])]" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "# all_df['pos_sent_tri'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])], axis=1)\n" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw...pospos_no_swpos_dictpos_dict_no_swbowbow_no_swpos_sentpos_sent_strpos_no_sw_sentpos_sent_bi
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig......[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...[NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,...[NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ...[NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS...[(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla......[(the, DT), (worst, JJS), (restaurant, NN), (t...[(worst, RBS), (restaurant, NN), (ever, RB), (...{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...{'the': 6, 'worst': 1, 'restaurant': 1, 'that'...{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...[DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R...[DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB...[RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN...[(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte......[(i, NNS), (have, VBP), (been, VBN), (to, TO),...[(asian, JJ), (restaurant, NN), (new, JJ), (yo...{'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ...{'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':...{'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3...{'asian': 1, 'restaurant': 1, 'new': 1, 'york'...[NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN...[NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ...[JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,...[(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ...
5The best restaurant I have gone to is when I w...N[The best restaurant I have gone to is when I ...6[the, best, restaurant, i, have, gone, to, is,...71[best, restaurant, gone, went, applebee, frien...30[the, best, restaur, i, have, gone, to, is, wh...[best, restaur, gone, went, applebe, friend, s......[(the, DT), (best, JJS), (restaurant, NN), (i,...[(best, RBS), (restaurant, NN), (gone, VBN), (...{'DT': 6, 'JJS': 1, 'NN': 10, 'VBP': 2, 'VBN':...{'RBS': 1, 'NN': 7, 'VBN': 1, 'VBD': 2, 'JJ': ...{'the': 5, 'best': 1, 'restaurant': 2, 'i': 4,...{'best': 1, 'restaurant': 2, 'gone': 1, 'went'...[DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ...[DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ...[RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V...[(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V...
\n", "

4 rows × 22 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "5 The best restaurant I have gone to is when I w... N \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "5 [The best restaurant I have gone to is when I ... 6 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "5 [the, best, restaurant, i, have, gone, to, is,... 71 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "5 [best, restaurant, gone, went, applebee, frien... 30 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "5 [the, best, restaur, i, have, gone, to, is, wh... \n", "\n", " stemmed_no_sw ... \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... ... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... ... \n", "4 [asian, restaur, new, york, citi, menu, writte... ... \n", "5 [best, restaur, gone, went, applebe, friend, s... ... \n", "\n", " pos \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... \n", "2 [(the, DT), (worst, JJS), (restaurant, NN), (t... \n", "4 [(i, NNS), (have, VBP), (been, VBN), (to, TO),... \n", "5 [(the, DT), (best, JJS), (restaurant, NN), (i,... \n", "\n", " pos_no_sw \\\n", "1 [(twin, NN), (trees, NNS), (cicero, VBP), (ny,... 
\n", "2 [(worst, RBS), (restaurant, NN), (ever, RB), (... \n", "4 [(asian, JJ), (restaurant, NN), (new, JJ), (yo... \n", "5 [(best, RBS), (restaurant, NN), (gone, VBN), (... \n", "\n", " pos_dict \\\n", "1 {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ... \n", "2 {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':... \n", "4 {'NNS': 2, 'VBP': 2, 'VBN': 5, 'TO': 1, 'DT': ... \n", "5 {'DT': 6, 'JJS': 1, 'NN': 10, 'VBP': 2, 'VBN':... \n", "\n", " pos_dict_no_sw \\\n", "1 {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ... \n", "2 {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ... \n", "4 {'JJ': 5, 'NN': 11, 'VBN': 2, 'VBP': 1, 'NNS':... \n", "5 {'RBS': 1, 'NN': 7, 'VBN': 1, 'VBD': 2, 'JJ': ... \n", "\n", " bow \\\n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'the': 6, 'worst': 1, 'restaurant': 1, 'that'... \n", "4 {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3... \n", "5 {'the': 5, 'best': 1, 'restaurant': 2, 'i': 4,... \n", "\n", " bow_no_sw \\\n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... \n", "4 {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... \n", "5 {'best': 1, 'restaurant': 2, 'gone': 1, 'went'... \n", "\n", " pos_sent \\\n", "1 [NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,... \n", "2 [DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R... \n", "4 [NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN... \n", "5 [DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ... \n", "\n", " pos_sent_str \\\n", "1 [NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ... \n", "2 [DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB... \n", "4 [NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ... \n", "5 [DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ... \n", "\n", " pos_no_sw_sent \\\n", "1 [NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS... \n", "2 [RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN... \n", "4 [JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,... \n", "5 [RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V... 
\n", "\n", " pos_sent_bi \n", "1 [(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (... \n", "2 [(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN... \n", "4 [(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ... \n", "5 [(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V... \n", "\n", "[4 rows x 22 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:4]" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN']" ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test = all_df['pos_sent_str'][1]\n", "test" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN'), ('NN', 'NN'), ('NN', 'CC'), ('CC', 'JJ'), ('JJ', 'NN'), ('NN', 'JJ'), ('JJ', 'VBZ'), ('VBZ', 'DT'), ('DT', 'NN'), ('NN', 'VBZ'), ('VBZ', 'RB'), ('RB', 'JJ'), ('JJ', 'RB'), ('RB', 'CC'), ('CC', 'JJ'), ('JJ', 'VB'), ('VB', 'JJ'), ('JJ', 'TO'), ('TO', 'VB'), ('VB', 'DT'), ('DT', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'IN'), ('IN', 'RB'), ('RB', 'RB'), ('RB', 'IN'), ('IN', 'PRP'), ('PRP', 'VBP'), ('VBP', 'VBN'), ('VBN', 'IN'), ('IN', 'DT'), ('DT', 'JJ'), ('JJ', 'NNS'), ('NNS', 'DT'), ('DT', 'NN'), ('NN', 'VBP'), ('VBP', 'JJ'), ('JJ', 'IN'), ('IN', 'PRP$'), ('PRP$', 'NNS'), ('NNS', 'CC'), ('CC', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'NN')]\n" ] } ], "source": [ "text = [\"this is a sentence\", \"so is this one\"]\n", "test2 = [\"NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN\", \"PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN\"]\n", "test1 = ['NN 
NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN']\n", "bigrams = [b for l in test1 for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])]\n", "print(bigrams)" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN')]" ] }, "execution_count": 51, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# all_bigrams = [bigram for bigram in all_df.pos_sent_bi.tolist()]\n", "# flat_list = [item for sublist in l for item in sublist]\n", "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "all_bigrams = [bigram for sublist in all_df.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams_n = [bigram for sublist in all_df_n.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams_p = [bigram for sublist in all_df_p.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams[:5]" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "count = Counter(all_bigrams)\n", "count_n = Counter(all_bigrams_n)\n", "count_p = Counter(all_bigrams_p)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['VBZ'], dtype='\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (ALL)\"))" ] }, { "cell_type": "code", "execution_count": 56, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (TRUE)\"))" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ8AAAGECAYAAADp8y3MAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAgAElEQVR4nO3dd7wcZdn/8c+XmkCASBGDKE0pgkpXihoELAgoSpFeReFBBaSIIqDwQ1FQRGwIUlSUpvAoD4JUqSJgQBBESijSSxLSIbl+f9z3JpvN7jm758zOTk6+79drX3vOzD0z19mzu9fMPXdRRGBmZlamBXodgJmZzX+cfMzMrHROPmZmVjonHzMzK52Tj5mZlc7Jx8zMSufkYzYPkLSOpJB0RK9jqRJJO0maJmmlXsdSBkkfkjRD0nq9jmWwnHzmYfnLqN3Hyj2K8ROSfiHp7vwlEZI27KP8MpJ+JulZSVMl3Sdpvw6Od2rD3z1D0kuS/ixp6xbbbCjpN5KeyjG+LOlaSbtJUotttpL0f5KezNs8L+lOST+Q9LY24jykyf9ooqR/57/hLe3+zfMrSYsCpwBnR8QTdcubvbb1jxFN9jVc0ri8/ot9HHOMpKfbiG1hSZ+TdLukF/J7+SlJf5F0XP37StKh/cR7V61sRNwE3ASc1v4rVU0L9ToAG5Q9G37/AHAgcBZwc8O6F0uJaG77AtsB/wT+Dby7VUFJiwE3AGsApwOPAJ8CzpG0dESc2sFxjwKeBRYG1iK9LldL2iEirqg75hHAd4HngfOB/wDLADsDvwF2kbRLREyt2+bIvM3DwNnAf4HlgbWBvYGrgafajPN7wH355yWBTYDDgU9JWqfuuA8Aw4HXO3gNhrq9gFWAH7RYX//a1pvaZNlngKWAR4H9gB8NNKicWP4IfBS4jpQgJwArARsAxwInA280bPod0v+50csNv58OXCFps4i4daBx9lxE+DFEHsA+QAD79DqWuphWBBbJP5+Q49uwRdmj8vp9G5ZfDUwGlm/jeKfmfazZsPxDefnf6pZ9qrYMGNlQXqQvtQB+Urd8ODAJeAhYrMnxhzXuq0Wch+R9f6zJurPyuq1L+P8s0ev3yCBivwe4tZPXto993QDcTUpoAazXotwY4Ol+9rVl3se5LdYvD6ju90Nz+a3ajHVh0snkr3r9PxjMw9Vu8xlJS+ZqncclTZf0jKRzJK3QUG7bfMm/k6QjJT2aq5celPS5do8XEU9HxPQ2i+8GvAL8qmH56aQv/c+0e9wm/ko6430HzDo7/Q7p7HPXiBjXEHcAR5Cu2A6UtFpe9VZgMeCOiJjceJCImNq4rwF4Jj/Pet3U4p6PpCUknZmrdiZLukXS5pIulTSxoexdku6XtIakKyS9WjuWpEVyddCteV/TJY2VdIakpRr2MysWSXvmfU7NVYa75jKr5WOMkzRe0i/zlW39flaV9KuGqsubJX22vxdI0juA9YD/a/9lbbmv1UgnJ+cDlwETgf0Hsct35ufrmq2MiOfz+2tAIuL1vO9PSZpna6/m2cCtc5KGkc7w1gcuBG4lVUl9AfiIpA0i4oWGzb5KqoY6G5gC7AGcJWnJiCis3jnHtg5wb
UQ0VkfckZ83GsQhViRdlTyZf1+HVL13dUQ81myDiJgh6Vzg+6SrpNNI1WmvA1tJWrXVth1YUtKy+eclgPcDXyIlvdv62lDSAqTqnQ8BF5PuBawOXEnrar+lc7m/AMfk3yFV+X2Z9OV7Gel/vQlwMPB+SZtExIyGfe0CjAJ+BownvY9+I2kG6YThT6T3z2ak6tcJpLN8JA0nfYGOBH5KqmJdGlg3l/9dX397/psB7uyjTP1rWzMpIqY0LNuPdBJyYURMknQJsJukI6KuurUDj+bnXSVdERGvtblds3gBJjc50bmd9PpvyOzPx7yl15defhT3oJ9qN+CwvP64huW75OU/rVu2bV72MvDmuuXDSV+MU4BlO4zvBFpUu5HqwwM4p8W2k4Fr2jhGrdrt/cCypC/HLUhfUgF8PZfbNf9+cj/7+2Aud17dsuPzstdJXwLfBz4LLNfBa1GrGmr2uA5YpqH8OnndEXXLds7LftBQtrZ8YsPyu/LyrzaJZ0FgWJPlX87bbNMklnHAW+qWv430JT4TOLBhP9fk/+FC+fdN8z4OHuB7/bS8/aodvrbHNvm7nwYur1tWq6Ldtcm+26l2WwC4Me9jAnAV8E3g4y1e40P7iDeAk5psU/t87j+Q168KD1/5zF92IFU9zXHFEhEXSToxrz+oYZtzo+5qKCKmSDqDdF/i48xdRTZQtSqZaS3WT60r047bG36fSLrJ++38+5L5eXw/+5mQn2dVPUXENyXdT2rEsDkp0QG8IekXwGER0ervaHQMKSkAjAA2Jn3h/6+kj0bExJZbpoYckJJfvUtIN9uXabLNG8APGxdGuqqZASBpQdJV2ELMrjp6H3NXcV0UEc/V7eMpSU8AbyFdKde7GdiadAU6ltmv+1aSLoqIxpvq/VkuP7/SR5n617bmkYbfP0qqSq1v4fZX4DHSFdFvO4yLiJgp6ePA/5BOcrYGPpZXvyrpmIj4eZNNjwL+0WT5402W1V6vN3caX1U4+cxfVgHGRsSkJuv+BXxS0mIx5yX+gy3KAqxaYGy1Yy7aYv2wujLt2I9U9TQDeBV4sCEhzJVUWmiapCLiMuCyXOe+DrAVKWkcRLoq/EqbcY6JiGvrfr9c0qOkL+9DgZP62HYVYEpEzFHFFhEh6WFStVmjp2LuaicAJO2Zj/ke5v5ueFOTTZpVOb4KTI+ImU2WQ0qIYyPiAUmnk16z7SXdQ0p0F0dEsy/gRrV7Jk2bwmeNr20z+5NOTB7K95Fqrga+IGmlqGvG3a78Gp8KnCppcVKV8XakasyfSXoqIhqT+T/aiLem9nfPs3PiuMGBVcXzpETx1sYVkt5Equ77bwf7uz0iro2IGyJiTJMrkfvz8/r97Ke2/p/NVkbEG3n/p5KuDqaQEt9gXJ2fPzzI/TTTNIFL2gu4gBT/IaRqna1JV8PQ/Lui8R5Qf8uhLllExGGk+25HkE4UDgbulnRcH9vX1LoOLN1nqb4CkZYjJYQRpBOq/9Q9Dsqx7jvQ/ddExKSIuDEivkJqTQeDf4/U/u5edaEYNF/5zF8eAzZucnUD8C7g+SbL12qyn3fV7a8QETE1V2VtJGnBmPPmdq1aq7EKZTDuJ/XT2VLSyhExtrFArn7al3QP44rG9Y0i4hlJTwGrS1q8xRVmOxbOz0v0U24ssJmkt9Vf/eSWfKt3eMw9SVd3W0RqTVXbV8sOwUWIiP+QGiecnq8QbgCOl/T9fqocaycP72T2Df5O7Ul6rb/M7BaG9Y4B9pH0rSZXcgNVaxww10lWh2pXaff3WarCfOUzf7mcVH11eP1CSTuRPsR/aLLNvpLeXFd2OKk11jTgzwXH91vSGd1eDcu/TLrnc1lRB4p01/ZrpBOwC5s0Jxapc+B7gJ9HxCN5+UhJzaqzkPRuYDXgyUEkHkgt6yD1O+nLH/Pz4Q3LdwLe3uExZ5CS7IK1Bbk13dc73E9b8us4x8lvfs0eJn0v9VcdemN+fn9fhfqxP+l/dUZEXNr4IF0Jr
kTqt9M2SWtLavX61/63/2qxvl3vJ1UX9vceqSxf+cxffkpqKn2ipNVJN+VrTa2fJrXiajQWuDPfSJ9COltcBzg6Ivq95M9nzrWbrR/Mz/tJqi07re4exJl5/z+VtCap+mMH0k3hr9bf3C5CRFwm6aukRggPSjqPdEN6GdIX+EakL/j6L/eRwG2SxpBacD1C+sJ+FylpinTjuF0frmteW2twsBfphvIp/Wx7Cemm9qGSRjG7qfV+pJ7yK3UQx6Wk1/laSReSqjl3pHvfEdsC35P0e1LCmUyqttwNuD4i+qxijYixSsPObENqRdkRSe8n/c9O76PY70kdjfcnNU2vWULSsS22uZB0r+1nkq4jNbT4L+ne4WbAp0n/22832XZLNR9W6fWIuKgu9oVJ9xgvj7m7Jcw7et3czo/iHrQxwgHpQ3Aa8ASpE+NzwC+BtzaUqzXl3Ak4klS1MY00RM7nO4ipr2avQUNzbVLz6LNI94Cmke61HNDB8ZqOcNDPNhuTrrqezq/JK6Sb37sDCzSUXRQ4gNSv5mHgtbzNU3nZZoN4XV7P+zkXWKWh/FxNrev+nz8h1f1PJvXd2ox0VfpCQ9m7gPv7iOmLpJEbppK+MM8EVsjHPbO/WPo6Rt3fu2H+fXXgF/n99BrpLP4B4BvAiDZfw/3yPtdocayWIxzkYwfwgX6O8bf8eiydfx/Tz/v5Y6QqtaNJCevJvP2U/Nr+CHh7wzH6a2rd2GT+k3l5W++1qj6U/xizOUjalnTWv1OkKgibh0h6DHglIrp6z6aXJC1Cao35l4j4Qq/jKYukG0g1x91okFIa3/Mxm4fle3CNy3YhNcP+y9xbDB2Rhm06mlSNu3JvoymHpNGkTrDtNuWvLN/zMZu3nSJpFVLHyImk4Vb2Jo3o3dj5dMjJV+XzzZV5RNzIELlocPIxm7fdQBqm/6uk+z8vkW56fyPaaBBi1iu+52NmZqXzlU8/JL1Busyd0F9ZMzObZUlgZkQ0zTM9u/KRtBGpafAWpP4IL5OGkD82coe+XO5GZg+fXu+iiJhj3g+laXW/Reor8ibgXtIoxk3n1WgzzpmAllqqvz5vZmZWM378eEit8preo+rllc/RpP4Il5Cmun0LqX3+PyRtHBH1A1o+ydw9rcc22ed5pAnHalMw7wNcJelDEdE4ynG7Jiy11FJLjRs32PnBzMzmHyNHjmT8+PEta4x6eeWzKXBX1M1yKemdpE6Fv4uIffKyG0nTEq/bz/42JnUIOywiTs/LhpHGPnomIj7Y1/Z97Heck4+ZWWdy8hkfESObre9Zk72IuC0apleONMjgAzQZzFLSQpJG9LHLHUk9xGfNIxJpFsJzgM3z8CNmZlYBlWovngdzXJ7UXLTeWsAk4DVJz0j6Wh70sN56wEMx90i4d5LG2+rzysnMzMpTtdZuu5PGRaq/v/MocD2pOm5J0syA/480am/9kBqjaD7fy7P5eYVmB5TUX32aWxqYmRWsMsknj2L8Y+AW6qZmjoj9G4qeL+li4EBJP4iIf+flw2k+BfPUuvVmZlYBlah2y8OIX0maanen6H/iptNIVWlb1C2bQvMpmIfVrZ9LRIzs60HD9MlmZjZ4Pb/yyZN4XUWq3tos2puzpTZrY/0Uus+Sqt4a1ZY1m6nQzMx6oKdXPrkp9B9Jc3tsW1eF1p9V83P92FVjgDWbtIh7X36+d8CBmplZoXp25SNpQeAi0qx/n4yIO5qUWRKYFhHTGrb7GmnK32vril8KHEGa6KvWz2dRYF/g1ogo7Mrn2W+dWdSu+jTquENKOY6ZWdl6We12GrA96cpnaUl71K2bGBGXA+sDF0r6LWnEghHAzqRh40+JiMdrG0TE3yRdAnw39+l5lDS0/EqkkQ7MzKwiepl8av1utsuPek8Al+fnW0lD5ixPutq5nzRN9PlN9rkXcGJ+fhNp2J5tIuLWwqM3M7MB61nyiYjRbZR5HNipg31OBY7MDzMzq6hKNLU2M7P5i5OPm
ZmVzsnHzMxK5+RjZmalc/IxM7PSOfmYmVnpnHzMzKx0Tj5mZlY6Jx8zMyudk4+ZmZXOycfMzErn5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHROPmZmVjonHzMzK52Tj5mZlc7Jx8zMSufkY2ZmpXPyMTOz0jn5mJlZ6Zx8zMysdE4+ZmZWOicfMzMrnZOPmZmVzsnHzMxK5+RjZmalc/IxM7PS9Sz5SNpI0o8l/UvSJElPSvqdpHc0KbuppFskTZb0nKQfSlqsSblFJZ0i6RlJUyTdIWnLcv4iMzNrVy+vfI4GPg1cC3wZOAsYDfxD0lq1QpLWBa4DhgGHA2cDnwcuarLP84DDgF/nfc4ErpK0Sbf+CDMz69xCPTz294HdImJ6bYGki4B/khLTPnnxycDLwOiImJjLjQV+IenDEXF9XrYx8FngsIg4PS+7ALgfOAX4YAl/k5mZtaFnVz4RcVt94snL/gM8AKwFIGlJYGvgglriyS4AJgI71y3bEXiddGVU299U4Bxgc0mjuvF3mJlZ53p55TMXSQKWB+7Ni95NivGu+nIRMV3SGGC9usXrAQ81JCmAOwEB6wLPNjnmuH7CWqrtP8DMzNpStdZuuwNvBS7Ov9euVuZKGnnZCnW/j+qjHA1lzcyshypz5SNpTeDHwC3Ar/Li4fl5WpNNptatr5VtVY6GsrNExMh+4hqHr37MzApViSsfSW8BrgReBXaKiJl51ZT8vGiTzYbVra+VbVWOhrJmZtZDPb/ykbQUcBXp6mKziHiubnWtyqxZY4FRwDMNZVuVo6GsmZn1UE+vfCQNA/4IrA5sGxH/bihyP/AGsGHDdouQGhCMqVs8BlhT0oiGfbwvP9+LmZlVQi9HOFiQ1FF0E1JV2x2NZSJiPKkT6p4NSWVPYARwSd2yS4GFgQPqjrEosC9wa0T4ysfMrCJ6We12GrA96cpnaUl71K2bGBGX55+/DtwG3CjpbGBF4CvAVRFxbW2DiPibpEuA7+Y+PY8CewMrMbvDqpmZVUAvk8+6+Xm7/Kj3BHA5QETcI2kr0igFPwAmAL8Ajmmyz72AE/Pzm4D7gG0i4tbCozczswHrWfKJiNEdlL0F2KyNclOBI/PDzMwqqhJNrc3MbP7i5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHROPmZmVjonHzMzK52Tj5mZlc7Jx8zMSufkY2ZmpXPyMTOz0jn5mJlZ6Zx8zMysdE4+ZmZWOicfMzMrnZOPmZmVzsnHzMxK5+RjZmalc/IxM7PSOfmYmVnpnHzMzKx0Tj5mZlY6Jx8zMyudk4+ZmZXOycfMzErn5GNmZqVz8jEzs9I5+ZiZWek6Sj6SHpO0fR/rt5X0WAf7GyXpO5JukPSapJA0ukm5sXld4+M7TcqOlHSWpBclTZJ0vaR12/4jzcys6xbqsPzKwIg+1i8OrNTB/tYAjgYeAe4DNu2j7N3A6Q3L7q//RdICwJXAu4FTgZeBg4EbJW0QEY92EJuZmXVJp8mnP8sDkzsofzewbES8LOlTwB/6KPt0RPy6n/3tSEpgO0TE5QCSLgYeBo4H9uogNjMz65J+k4+kDwKj6xZ9WtI7mhRdGvgsMKbdg0fEa+2WzbEsCiwYEa0S3I7AM8AVdcd4MSegXSUtHBGvd3JMMzMrXjtXPluQrhoAAvh0fjTzCHBYAXE18xFgErBgvq90SkSc1VBmPeDuiIiG5XcCBwLvAB7sUnxmZtamdpLP6cB5gIDHgEOpu7LIApgYEa8UGt1s9wE3k6rPlgM+B/xc0tIRUd/oYBRwfZPtn83PK9CQfCSN6+fYSw0o4i67/ZSPlXasTY7+c2nHMrP5Q7/JJyLGA+MBJG0BPBgRL3Q7sIYY5mhhJ+lc4BbgG5J+mmMEGA5Ma7KLqXXrzcysxzpqah0RN5WdeFrEMYN0RbYYsEndqinAok02GVa3vnFfI/t6kBOvmZkVp+PWbpLeDnweeCewDKk6rl5ExJYFxNafp/Lz0nXLn
iVVvTWqLXumqxGZmVlbOko+kj5Oag69CDCR1I+mV1bNzy/WLRsDbCpJDY0O3keK95GygjMzs9Y6HV7n28BLwMYRsWRErNLsUWSAkpbOnUfrlw0DjgReA26vW3UpqVHBJ+vKLgvsBFzhZtZmZtXQabXbmsCxEXFXUQFIOjb/uFZ+3lPS5sC4iDgT2B74uqRLgbGkqr69gdWBgyJiYt3uLgXuAC6QdCopUR5MSrInFBWzmZkNTqfJ50VgesExnNjw+375+QngTOCfwEPAnqRm1tOAe4CvRMSf6jeMiBmStgG+B3yJ1LrtTmCviHCVm5lZRXSafH4FfAY4o6gAIqKxwULj+ruB7TrY36vAAflhZmYV1GnyOQ/YQtIVwA+Bx4EZjYUi4snBh2ZmZkNVp8nnIdJoBgK27aPcggOOyMzMhrxOk8+3SMnHzMxswDpKPhFxQpfiMDOz+Yin0TYzs9J1OsLBB9spFxF/HVg4ZmY2P+j0ns+NtHfPxw0OzMyspU6Tz74t9rEasA9pBIKfDy4kMzMb6jptcHB+q3WSvkcaecDMzKxPhTU4yCMLnA0cVdQ+zcxsaCq6tdurzJ7qwMzMrKnCkk+e5mBP4Lmi9mlmZkNTp02tf9li1dKk6ayXI82zY2Zm1lKnrd32abH8FeBh4LCIuHBQEdk85dwffbiU4+z7xetLOY6ZlaPT1m4eEcHMzAbNycTMzErXabUbAJKWBLZidsu2x4C/RMRrRQVmZmZDV8fJR9IBwGnACNK8PpCG3Jko6fCIOKfA+MzMbAjqtLXb9sBZpCudbwAP5FVrA18EzpL0QkT8sdAozcxsSOn0yuco4EHgfRExsW75dZLOBe4AjgacfMzMrKVOGxy8FzivIfEAkO/3nJ/LmJmZtdRp8lE/6z3FtpmZ9avTard7gX0k/SQiJtWvkDSC1An13oJiM2vLUedsUdqxvrv/DaUdy2wo6zT5fA/4PXCPpDOAf+XltQYH7wA+XVx4ZmY2FHU6wsHlkg4BTgF+xOxqNgGTgEMi4opiQzSrvq0uOKa0Y12717dLO5ZZt3TczycifiLpQmBrYJW8uNbJdHyRwZlZZz5y7s9KO9Y1+36htGPZ0DOgEQ4iYhxwScGxmJnZfKLf1m6SFpT0HUl9nuZIOkjSyZL6axFnZmbzuXaaWu9BmqPn7/2Uu5PUwXTXdg8uaVRObDdIek1SSBrdouz2ku6RNFXSk5KOlzTXlZukkZLOkvSipEmSrpe0brsxmZlZ97WTfHYGro2Iu/sqlNdfTQfJB1iDlLBWBO5rVUjSx4HLSfMGfTH/fBzwg4ZyCwBXAp8lNYg4ClgeuFHSah3EZWZmXdTOPZ8NSAOJtuMG4PAOjn83sGxEvCzpU8AfWpQ7FfgH8NGImAEgaQJwjKQzIuI/udyOwKbADhFxeS53MWmiu+OBvTqIzczMuqSdK5+lgRfa3N+LuXxbIuK1iHi5rzKS3gW8C/h5LfFkPyHF/5m6ZTsCzwCzmntHxIvAxcCnJC3cbmxmZtY97SSf14Bl29zfMsBc474N0nr5+a76hRHxDPB03fpa2bsjonGYnzuBJUidYM3MrMfaST4PAB9pc39bM3uahaKMys/PNln3LLBCQ9lW5WgoC4CkcX09gKUGE7yZmc2tneTze2ArSZ/sq1Ce62dr4LIiAqszPD9Pa7Juat36WtlW5Wgoa2ZmPdJO8vk58AhwsaT/J2nl+pWSVpZ0Eum+ysO5fJGm5OdFm6wbVre+VrZVORrKAhARI/t6AB61wcysYP0mn4iYAnwCeBw4BnhU0qu5r82rwKPA1/L6bSNiauu9DUitymxUk3WjSA0M6su2KkdDWTMz65G25vOJiEeAdYEvA7cAM4C35Oeb8/L1I+LRLsQ4Jj9vWL9Q0gqk/kFjGspu0GSUhfeRGkI80oX4zMysQ21PJhcRUyPiRxHxoYhYNiIWyc+j8/K5qrSKEBEPAA8BB0pasG7VQcBM5rzHdCmpUcGs+1OSl
gV2Aq6IiNe7EaOZmXVmQAOLFknSsfnHtfLznpI2B8ZFxJl52ZHA/wJXS7oIWAc4hNT35+G63V0K3AFcIOlU4CXgYFKSPaGrf4iZmbWt58kHOLHh9/3y8xPAmQAR8SdJnyaNUvAjUmfWkxq3jYgZkrYhTXr3JVLrtjuBvXLVoZmZVUDPk09EtDUKdh4u5/I2yr0KHJAfZmZWQW3f8zEzMyuKk4+ZmZXOycfMzErn5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHROPmZmVjonHzMzK52Tj5mZlc7Jx8zMStfzsd3MbGj5xLlXlnasK/f9RGnHsmI5+ZjZkHTC+f8t71h7v7W0Yw0VrnYzM7PSOfmYmVnpnHzMzKx0Tj5mZlY6Jx8zMyudk4+ZmZXOycfMzErn5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHQe283MrEvu+O4zpR3r/UetUNqxiuDkY2Y2xD391dtKO9aK39m0rXKudjMzs9I5+ZiZWenmieQjabSkaPFYs6HsppJukTRZ0nOSfihpsV7FbmZmc5vX7vmcDtzdsGzWHT1J6wLXAQ8AhwMrAkcAqwLblRSjmZn1Y15LPjdFxOV9rD8ZeBkYHRETASSNBX4h6cMRcX0JMZqZWT/miWq3epKWkDRX0pS0JLA1cEEt8WQXABOBnUsK0czM+jGvJZ9fAROAKZKukfTuunXvJl3J3VW/QURMB8YA65UWpZmZ9WleqXabDlwKXAW8BLyHdC/nFkkbRcTDwKhc9tkm2z8LbNJsx5LG9XPspQYUsZmZtTRPJJ+IuA2o7yX1v5L+SLrKOR7YHRie101rsoupdevNzKzH5onk00xE3CvpWmDLvGhKfl60SfFhdesb9zOyr+PkKyNf/ZiZFWheu+fT6Clg6fxzrbptVJNyo6hrkm1mZr01ryefVYEX88/3A28AG9YXkLQIsC6p0YGZmVXAPJF8JC3XZNnmwBbA1QARMR64FthT0oi6onsCI4BLSgjVzMzaMK/c87lI0mRSo4OXgHWAA/PPJ9SV+3ouc6Oks0kjHHwFuCoiri01YjMza2meuPIBLgeWIyWSHwOfAS4ENoqIJ2uFIuIeYCtSi7cfAJ8DfgHsVHbAZmbW2jxx5RMRZwBntFn2FmCz7kZkZmaDMa9c+ZiZ2RDi5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHROPmZmVjonHzMzK52Tj5mZlc7Jx8zMSufkY2ZmpXPyMTOz0jn5mJlZ6Zx8zMysdE4+ZmZWOicfMzMrnZOPmZmVzsnHzMxK5+RjZmalc/IxM7PSOfmYmVnpnHzMzKx0Tj5mZlY6Jx8zMyudk4+ZmZXOycfMzErn5GNmZqVz8jEzs9I5+ZiZWemcfMzMrHRDLvlIWlTSKZKekTRF0h2Stux1XGZmNtuQSz7AecBhwK+BLwMzgaskbdLLoMzMbLaFeh1AkSRtDHwWOCwiTs/LLgDuB04BPtjD8MzMLBtqVz47AlqeqGcAACAASURBVK8DZ9cWRMRU4Bxgc0mjehWYmZnNpojodQyFkfQXYPmIeE/D8i2Ba4FtIuKqhnXj+tntUgBLLbXUrAUxbXoh8fZHiy7Sct2MaZNKiQFgwUUXb7lu+vRy4lhkkdYxTJ0+sZQYAIYtMqLp8kmvTy0thsUXHtZy3aTp5bw3ARZfpPn7c9L0N0qMoXXlzdTp5X23DVtETZfPmFZeDAsu2jwGgJg6o7Q4NGxBAMaPHw8QEdH0ImdIVbsBo4D/Nln+bH5eYYD7jfHjx08Y4LaQExgwvqOtpk4ZxCELigFgauebFB3HlCm9jwFgWgXiGM+0nscAMH5K79+f4wsNYTDviwrEUez5z8C/L2a/PZck3XNvaqgln+HQ9JM5tW79HCJiZFcjYvbVVRnHqnIMVYmjCjFUJY4qxFCVOKoQQ1XiKCOGoXbPZwqwaJPlw+rWm5lZjw215PMsqeqtUW3ZMyXGYmZmLQy15DMGWFNS413h9+Xne0uOx
8zMmhhqyedSYGHggNoCSYsC+wK3RoSvfMzMKmBINTiIiL9JugT4bu7T8yiwN7ASsE8vYzMzs9mGVPLJ9gJOzM9vAu4j9e+5tadRmZnZLEMu+eQRDY7MDzMzq6AhNcKBmZnNG4ZagwMzM5sHOPmYmVnpnHzMzKx0Tj5mZlY6Jx8zMyvdkGtqXSWSFiLNZ1HeZBrpuGsDmwHvApYFAngJeBC4LSLuLzOeXpO0GLA6c74W/4mIySXHsTpN/icR8fD8FENdLCPr44iIQuerqDpJw4At6eOzClwXEV0dELlX3xdual0gSZsDn2T2P3KJvGoCs99Ml0fELV049rLAF0ida1cDBEwHXs0/jwQWIb2xHgfOB34WES8WHUuO523A9rR+U98K/DEinuzS8d9Oei0+CbwXWLChyAxSB+TLgV9FxBNdimNz0uga2wPLkP4X9QJ4Bfhf4LyIuHkoxpDjGA58htmfkeUbijxP/owAv+/WyUEF3ptrA4eRZl5ekjTa/lPM/qy+CXgbaTT+10jDhn0/Ih4oMIaef184+QxSvrr5HHA46Z/4CnAP8BhzvplWAdYHlib9M08DfhERrxcQw3eA/yElucuAvwB3N45lJ2kFYAPgI8CnSW/8H0XE1wYbQ90xPg58BRhN+tsfpflrsRrpjf1X4NSI+L+Cjr8WcALpS24ccBNwd4sYNgA+RPqg/R44LiIeKiiOjwLfBDYG7if/T/qIY2tgHeAu4NiIuGYoxJDjWBo4mvRltxgp6fcVx3tIX8g/A06JiJcLiqOn780cw+9ISefvwCWk/8m/GmtHJC1ISowfyeU3BC6JiN0KiKEa3xcR4ccgHsATpKkavg2s30b5DYDv5G3GFhTDLaQzOXWwjfI2txT4WtwOTAZ+W3uz9lF2SVKC+G3e5raCYphOSiQfBRZqo/xCwMfyNtMKfC0mAN8H1uhgmzXzNuOHSgx1cdwNfB5Yro3yy5ES1V3AhIJi6Pl7M+/718B7BrDde4FfFxRDJb4vfOUzSJIOAC6IiOkdbrcIsFdEnN2dyMon6STgjIh4ocPt3gx8KSKOLSCG1WOA9y8Gs22TfY2MiHFlb1u1GPK+toqIa8vetmE/PX9v2pycfMzMrHRu7WZmVqLcGGYO0aXGDVXmK58CSNqrcVlEXFByDDNJN0nrY2hs4VVGHB9sXBYRfy05hl82iWG/MmPIcVzfJI4Pz28x5DiOaxLHt0qOoefvzRxH/WdVKYxyP6tV+L7wlU8xzmv4PYBSkw+wHw1vph65kRRHrTlvMHcz525bpeTjtdKV5tsdepJqvC/2bfg9gFKTD9V4bwJs0YNjNur594WvfKxQklZqXBZd6kNj1gm/N6vFycdsCMvVbv8vIq7rdSxm9VztNkQ0u+/Un7LvS83PJG1F6sS5Gmnki9eAR4BrImKu+zIFGg30vDm/pIVJ09s/FhFn9VHuQFK16bFR/rBUw4ER0aVRP+qO8/0ON5lB6iv1L+CqKGDkB0lfGmgMEXH3YI8PvvIpRLObuv2IiNiy4BhmtnvsuhgKP/lodrO/v3giYv+CY5jrZmobMXTlREzSiqThUTZi7mFtIMV5B7BzRPy3C8efCewRERcWve8O4/gCcAapg2XLUSTyCBX3AgdFxDldiGN90ogBCwMXRcRdkjYDfgisl4s9D5zQV5IcZAztflabeR7YJiL+0aMYAngA+GREPD6oGJx8Bk/SGNr7slsKWJkutG6R9N42ir0D+DqwLjApIpbop/xA4hhHe6/FAsAIKL6VjaTT24zhQ6TXoiutjSQtSuqlvxZwFql3+wPARNLfvg6wB2l4pvuBjTvtrNxGDFVJPjcBz0XELm2U/S3wlogo9MZ8bu12NbAo8AYwE9gFuBAYTzoJWBjYhDTUzq4RcXGRMQyUpCVIwySdBzwaEaMHub+57n/1twnpin1j4FTg1ojYdjAxuNqtABGxbl/r8+i9hwJfJn0pXtaFGO7t4/jvBI4FdgWmAd8jvYEKF
xEj+1qfx6zak5QElyB9ORcdw6H9xLAFcDwp8TxNGu6oG/YA1gZ2iYhLGtaNJw1geaukG4DfAbsD53Yhjk9LekebZSMiTuxCDO8F2v0ivwU4qQsxfB0YC3yYNAbjWcCvgH8AW0cePTp/Xm8nDf5ZieQTEa8B10n6Aan6crD7G2hDi39KWh44ZrAxFDJGjx8tx0MaSWpO+irwOukLZu0Sj78G6Wz7dVJ97beBZXv0WixIat75CKn++A7g4yXHsBVpsMgZpGbQXwAW7uLx/gTc2GbZG0kjKRcdw8wOHzO69FpMBfZrs+x+wJQuxPAccETd7+vkv3mPJmWPBCZ2670xiL/h3cDxPY7h/cC5g92Pr3y6II/i+xXgEGBx0tnTiRHxYEnHfxfwDVLd9mTgu8BpEfFKGcdviGUhUh+PY0hVjncA/xMRV5cYw0eB40gfmieBg0gfnkGPKN6P9wI/bbPs1cDBXYrjQNIIyr30DOkqsB1rA892IYY3N+z3ufz8TJOyzwDDuxADku7pcJPazf4HSNN/fLOAGH4/iBgujojGflsdc/IpUJ4j4wjSl8hipCudk6KgYfrbOP46pC/ZT5PuK3yHNA/Iq2UcvyGWhYH9ga8CbyfN0/L5iPhLiTFsQ3o9NiZVt3yeNF/NGyWFsAzNv9iaeTaX74bJ0fuJ2q4F9pJ0cvQxRUL+DO0F/KFLcUSLn8s0ocNj1+bX2R84WNIeEfG7Qcaw/gBiWIJ0InmipO1jkM33nXwKkEe+PZJUjTOMNBT7SVHuDJWXkSbpmkCqL/9BL75w8mjdnyPN37Iiqf5+/8G+UTuMYTvSld8GpLmTaiOPl5V0aoaRpnhox+ukG+FD1Smke2A3SNo/Iv7eWEDSRqRm4YuT7kt2w9skvSf/XLs/uaqklxrKzTX+WlFigI0FJI0gXSEfRzqxHUwMKw8whtWAK0gnthsNJga3diuApImkS/QxwMnAf/rbJiLuKziGWvPisaSb2W2EEBsUGUOO42lgFOm1OBHotxl6REwoOIbaazEG+A2pZVM/IcSPioyhLo5jgKvaKP4J0glL0S3/KtHaLceyHall2WKkidzuJ/V3WoJ0/2U10iRyu0fEFV04frMm+GqybNbyov8fg5WbrP8oIhbuYQyHkCbZGzao/Tj5DF5Dm/n+XtCuvKkl3djGsecQBTdlzXF08lrU4ujGF24nuvIl02F/o269L/YGboqIsUXud6AkrQocBWwLrFC36llSA43vRsSjXTr23p1uExHndyOWgcq1LGtFxE09jGEV4IODfW2cfAowFN7URZF0fKfbFHEDtSGGTvswEF0Y46tq74vc5H4r5h5l4dqIeKRbx+0jniVIs4ZOiNSUeL5QhRGlJT3eJIZVS43BycdsaJO0JPAL0tTQCzQpMpPUIvPAiJjYxTgWJ33ntDxGvq8RETGpC8f/M6ka9g/d/DvbiGMf5v7iL/VkVNIJTWIo9CSw3xicfMyGLkkLkPoQbQ78mdajLHw0l9syuvClIOndwN2kLgctO0lKOpZ0Q/3dEfHvgmMYR7rimwr8kZSIrupBQxTDyacQVRjUsyrjmTWbsKuNQAqd0GsAfRgiIj5TZAx1sYwANiUN23JDREyWtBypNeAHSAngXtK9jjFdOP7OpJZRR0REywEtJX2F1B9sl4i4tAtxnEUa5HTNiGh5Ty4ny38B10dEof2e8nBHnyCN9LENqTXiOFIfqAuLfh9a35x8ClCFQT0rNJ5Zf0mwfiIv6M5rMbafGBpFN+q7843Z64CVSH/306R7LleS7ruMI3V3GEFq5fXBKGjE4LoYLgNWiogN2yh7N/B4ROxYZAx534+SvuC/0UbZbwG7RUS7QwINJJ4RpP5wuwJbkkbg+C+pm8SF0cdwVVYM9/Mpxnr9F5ljUM9BD4neKKozntkObZQZSRrnbl260NFvoH0YuuAE0t+6O/ACaXijK4FFgPVrVzr5avFyUnXTJwuOYX2g3fsJfwQ6biTRphWBx
9os+3gu3zX5ns8FwAWSlgF2BnYjdRI/QtJDwG8i4uRuxjFfG+z4PH70Ow7SO0kf/umk1kWnAMuVePxSxzPrJ5aRwDdJgzrOAC4C1un1/6iLf+9YUt+d2u+bk27u/0+TsicBL3chhol0NqZaV8Yzy//zw9osexjwSo/+Z28nnQh0bZw7P9LDVz5dImkNUi/7XUhVKqeRxldr7EndreP3ajyzZrG8idlj3Y0g1bGfGBH/KjuWkr2VOTsc135uNvLFw8zucV+kxUg32NsxjS6NZwb8k3S/5QdtlN2G1AG1NHnUg92Az5IS0BvANWXGML9x8ilYrwf1rMB4ZvWxLMPsse4WJ13pnBhdHuuuCn0YsgVJw+bU1P4HzWbo7OasncNzc+v+LNbFGH4F/FzSQRHRcrBVSZ8nTXnw+S7GUjvWaqR7PrsCa+bFt5NqJy6OPsags8Fz8ilIrwf1rNB4ZuTWXLWx7oYze4DVQpvO9uF8ejdoZKNmcZQd21n50Z9WQ80U4Zek+ypnStqW1My5cXid3YGPkYZk6nRG3LZIGkWqjdgV2JD0Nz9Amu/qwuhCZ+PBkPR2YFwUPARVFbi1WwEaBvU8gx4M6lmh8cxOI521LkIax+uk6EHv+SrI/5NJzL76EWk224nM/f9ZGFgsih9ep+cjTtTFMhz4Pml05sa/U6Srv1+S7g0V3ihHabr7D+RjP8Xslm2FjrNYpPwemgycSWqO34tpUWaSRmc/CTi7qBNaJ58CVGFQzwqOZ3Y/afDIduLoSh+bXpN0Hp2PtzfoeVKqTtJbgY+TphdfknTS9hCpw+fTXTzuS8ClpFZsN3frOEXK76HFgc1IJyfduC/YXww35hjeCzxdVBW2k08BqjCoZ4XGMxtLBfrYmDWStFAvqqGLImmNEquumx1/cWDzKGgiSCcfsy6R9FVStc6TPYxhrhEnYj7tyZ/vn8yhl/+b+Z2Tj1mX1FVB3ka6D3dJ2S2o6mKYNbJEN6pb24ij5y0QG0bfqOR8PfMTt3YbIiTN1TooIvbrRSw2y2akVlU7AT8BfijpGlIiuiIippQQwyolHKMdVWiBWPj8VQNRkUQ81ySPEfHhUmPwlc/gVWR+jhsalxV9X6nNOHr+waqaPFjmh0mdGHcgtXibROpJfyFwTUR0s5+PVUglpjOQzm1cVnZjFyefAlRhfo6qqMIHq8okLULqwb8raTbP4cBLpPl0fhsRt/YwPLPSOPnYfK1bfRjaPPbiwKdIY6ptAcyMLkxzYdUi6THg0Ij4317H0kt+o9v87q+kPgxnAEcBpVQR5qq4DwAfATbKi0vtmGw9szJpjMOeyx1/F4y6mV3zCBT1c039uhudfp18bL4WEaNhdh+Gbh9P0uakKrcdgWVJg3n+kXTv5/+6fXwzmPV+P580MssCkq4gzWj7W1J1cP28W0dI2jwiXigyBicfMyAiJgGFdJ5rJGldUsL5LGmempmk8ct+A/y+/qzTrCRfJVX5nk2aa+oLwGXAB0kj0F9PGvLpE6QxI0+k4MFefc/HrEskHUdKOquTziTvJF3h/K7os0ibd+T7jH+l/cn1IiL2LziGh4CbIuLz+fcdSMnnxIg4vqHsL4CPRcTbCo3ByceGmir0YchxzAT+TUo4F0ZEO2Pd9YykvYBnI+IvvY6l1/LIEM9FRLO5lwa779pgoe3OrRURsXTBMUwGvhQRZ+ffVyTN+7VdRFzZUHZ/4GcRsXCRMbjazYaiqgyLv2FE3NPrIDpwHhCS/g58PSKu60UQvWyBWOdGYKakS4HjupCEDoyICwveZyeGkRJgzZSG53pTgQWKDsDJp2RVmJ8jXxk8A5w8FGcTrcrI0PNY4gHYl9TC6QPABaSZWHuhJy0QG3wrx7A5aZqSbk60VyWlVYW52q1kFZqfA9KN799GxJ5lx1AXR6/PcLvGoz0MTtGjKA8mjtwgpaj9zQT26OWVT47hLqA2hUWt8/OtwIsNxVcENih8riknn3JVYX6OHMfip
DPcD0XEMT2K4Ua6MU9Ik9GL+9ON0Y092oM1U5HkM5bOp4EpdJxAJ58e6vX8HFVR+DwhTcba60cM1ZEFPI1A9Uj6EPDg/N7i0cnHhhxJh9J/8lkI2B1YlyE8tH5VphGoQgvEKgwA3EjSVsDWwGrAEsBrwCOkwWbnes2GkiF5tjc/yn1K5hAR3+pFLL0WEae3WidpQWAv4GukD/zfgaFcFVaJaQSoRgvE/ej9tA7ArKbNl5KGVlKTIkdKugPYOSL+W2pwDSR9BPhq0ScLvvIpQBVuLOcYGkIo/+Z2Fc5wm8lJZ1/gGNLYWncC34yIP/cyLpv/SFqUdLN/LeAs4NfAA8BEUmvDdUhD3XwOuB/YOCKmdymWDUknYa8Cf42IqXXrdgaOBtYjtdAttK+Rr3yK0fOJsoq+GTgIVTjDnUXSQsD+pOFEVgJuBw6KiGt6GphVQr7fuCTwWonDHO0BrA3sEhGXNKwbT2pxdmueo+t3pOrhuebfGQxJI4E/AZvULX5B0jakvj6/ISWdscBhwDlFHh985WNDlKSFSWeORwNvI32gvxkR1/Y0sJI1q47tTzeqa6vSAjHHsipwJGncsvq+TP8lfSGfGhHtDn0zkOP/CRhRG9S2n7I3khLjdgXHcCZwMCm53UKqDTiINOTPm0lXQt8kTf0+s8VuBheDk8/geX6OapH0RVLnxBWAm0lJZ66ZXucHdX26+jPri6AbN+Gr0gJR0mdINRWLkc7q7yPd5F8CeDdp2vHJwN4RcVnRx88xPAX8NCJObqPsMcDBhY+rJj0B3BIRu9ct24s0ysX1wCciYlqRx2zkardirEyP5+eQ9CqdfbhnABOAfwHnFfVBq8gZ7g9Jr8UY4B5gO0l9nTlGRHyl4Biq4k1tlNkCOI7U8q9bN7cPp7MWiIWT9B7SOHsPA19oNmuspM2AnwIXStowIv7ZhVCWIXWubsezuXzRRgGNJ2S133/W7cQDTj5DyRV0lnxEOttbH7hY0uER8cMC4hjbYRxBd96HItVZr9dmDD1NPt0a7SEiWk5QJ+nDpKTzAVJP9/+hC3X7OY4qtEA8Bnge+EBEjGsR5615UNH7cvnduhDHMKDdBgSvA4t2IYaFmHNsN+p+L2XUFSefISIi9hnIdpJEmszsy6QrhsHq+RluRBQ+CGIJShvPTNLWpKSzKfAUqe7/lxHR7ijLRcXRrAXiF7vYAvFDpLP6pomnJiLGSTqHNMdNt7wtX4n1p+OahEHEUBttZVVJLzUWjoj7ijy47/kUoArzcwyGpANIH8qunoy0OsONCM/gmXVzPDNJHyMlnfeRhs8/mVTlWnbSadYC8ZvdboEoaSqppWO/Lcck7Uu6LzOsC3F0cv+rKx2D+4hBTZZ3JQZf+RRnQ9IYZe0I0oevMG2eRdWbAUyIiKdI9eBdm8OlB2e486zowoyqkj5BSjobAY8DBwLnlz2Qa4sWiJ8rsQXi86SJ/dqxRi7fDVUYdb3nMfjKpwAVGSiw09ZENa8BvwKOrO9gVlBMvTrD7XRYkoiILbsSTI9Juot03+tR0pXOBd1qOttPHD1vgSjpbGAHYN180tWq3NtIjVV+HxGfKyu++Y2TTwEqknz27nQTUoODjUk3VX8eEQcXFEtP+9hIGkN7iXgp0pVYV8Y7q8JoD3VNrScC7VzpREQU3rqq7uRoDHBTm3EU2ghE0srAvaRWnkcCl9VXOeb37WeA75HeG++NiMaRQ6wgrnYbIiLi/IFuK2kcsAvpxvOgNDnD3afsM9yI6LMhQ+7dfSipkUWQ5q7vhiqM9nABFRnPjB63QIyIsbnJ/cWkHvznSPo3s/v5rEFqifYSaTrpriSe3J+mMbYLunGsSsfgK5/Bq8KVz2BI+jSpV/egW1hV4Qy3lZx0Dge+SOqXdRlwYkQ8UMbxrRokvYnUkm1b0vhqtdGkHyKNcPCz6OIkj006/pY+0nglYnDyGbwqzM9RhTOZHEen9xO6/qaXtDTpLPoQUnPmi0lJ58FuH
teqJXeAfjEipvQ6FnPyKVyv5ueowplM1UhaFjiCVJ24GGkcq5Mi4qGSjt/z0R4G0Aqy8P4cVSFpBrDnvFpDMdT4nk9Bej0/xzzasbIrJL2ZdEP5C6Q6/N+Sks7DJYcylt6P9tBu4wuY3cejlMYX/ehGC8Rmn0vrESefAuT5Oa4i1R//nL7n57hSUtfm5zAgdfYdTvriPRn4DzCsr6uALp3t93y0ByrQnyNbmg5bIHY1mh6RdC/wHdJo0W31s8pdFnYBjoqIdvsSVjoGcPIpSs/n56iKipzhLpaf1yPd3+lL1872qzCe2WBaQRapQi0Qe53ULgLOBM6UdDlwLWnw28drJ6T5ZHYVUsf1rYDtSXGfOoRi8D2fIvR6fo6qnMnk/fa8j80A+jyV9iXtGVXnVGYLxHxf9EVgUpubRESs1oU4lgQOAPYmTeNQ+7xMJZ0M1QYSFWkm03NJY++1HCR2nozByWfwej0/h6SvMbtPRMdnMhHx7aJiaSPW+jPcJUkd/XYu6/i90sPRHirRCrJRL1og5uRzN2kE77ZExA7digdmdXzdFFiT2VMnvExq9n17GZ1cexZDRPgxyAdpKPJ92iy7DzC5CzEsSTqDvBeYSRq7bQbpLG9y3e8zScPFHwYsVeJrNBL4FmmGxNdJ1Y9r9/p/V8LfvTCptd0T+bW/GdiqxOPPbHjM6PHrsSzpKn0CacSFXwNrlvha7Nbr94Qf6eF7PsXo+fwcETEB+D7w/SqcTdX06Ay3Emf7FRntoRKtICvUAtEqwtVuBciX88eQWrz15xOkD92Q7oPTyz42VenzVOXRHsomaSJzt0DsUxQ9f8w8PhLJUOPkU4AqzM9RFT7Dna2Koz30SsNr0d9npZtz2Dj5VISr3YpRlb4UVVCVPjY91+sqryq1gqQCn5Fe/z9sTr7ysUJV4QzXknmpFWQZqnIv0BInHytUr/vYVOxsv+eq0J+jKqpyL9ASJ58C+IyqOqp0tl+R0R5m6WUrSH9GrJGTTwF8RlUtVTnbr8JoD1Xhz4g1cvKxQlXtDLdKfZ4aza+jPZiBk48VzGe4/fOMquXzvcDqcVNrK5Sbs7bmGVV7qhIjOdtsvvIZJJ9RWX96PaNqr1XlM1KVe4GWOPkMUpVaV1m1eLSHpIqfkSrfC5xfOPkUwGdUSVXOcKuiCuOZVYU/I9bIyadg8/MZVRXPcHvJoz00Nz9/Rmw2Jx8rlM9wZ+v1aA9mVebkY13jM1wza8XJx8zMSud+PmZdUrXRHsyqxFc+Zl3i0R7MWnPyMTOz0nkoFDMzK52Tj5mZlc7Jx6wLJN0radc8gkO72ywkafc8UoTZkObWbmbd4VGUzfrgBgdmXeLRHsxac/IxK4FHezCbk5OPmZmVzg0OzMysdE4+ZmZWOicfMzMrnZOPmc0i6deS2pqF1mwwnHzMukzSxySFpJOarHt/XjdN0mJN1v9Z0kxJy5YTrVk5nHzMuu8W4A1gdJN1W+R1i5CaYs+SR0fYHLg/Il7qcoxmpXLyMeuyiJgI/B3YqMnVzWjgL8BzzJ2cNgIWB24sIg5JwyV5SgerBCcfs3LcQLq62ay2IF/ZbAbclB9bNGwzum7b2jbrSrpC0iuSpkp6QNJXJM3xWa7du5H0ZknnSXoBmASMyuuHSzpN0rOSpkj6m6StCv6bzVry2G5m5bgB+Bqzr3Rg9pXNTcAE4IeSFo+ISXn9aNKQPDcBSHpf3s804MfA88AnSWPBvYc0jE89kcaUexr4FjACmJzXXQxsC1yR43kHcDngkRasFE4+ZuW4FZjOnFc3o4GJwF3AeGBh0pXQNXVXRfdFxCu5/Bm5zEYR8QCApDOBS4G9JP0yIm6q2/8CwD0RsU99IJK2ISWecyLigLrltwKXADOK+IPN+uJqN7MSRMQU4G/AhpIWz4tHA7dFxBsR8SDwArOr2mpXRTcASFoB2Bj4Qy3x5P0GcHL+dYcmh242Qvan8vP3GmK8FHi0oz/MbICcfMzKcwPpy
mXzhvs9NX9l9pXR6Px8Y35eJT8/wNz+lZ9XbbLuP02WrUpqYfdIk3UPNllmVjgnH7Py1BoOjGbO+z01N5GujEbkMjNJCWmgZkTEtEFsb9Y1Tj5m5bmdNJfPFqTkMoXUBLvmJtJ92NGkq6IxEfFqXvdYfl67yX7XaijTn8fycd7Rx77MusrJx6wk+SrkdmAD0g3/22uzmmb3k+b4OZKG/j0R8SxwJ/ApSbMShCQBx+Rf/9BmKFfk5yPrF0raEVitzX2YDYpbu5mV6wbSlc+mwPH1KyIiJN3M7AYBNzRs+6W87BZJPyE1td4e2Bq4oKGlW0sRcaWkq4D987A91wDvBD5HSoC++rGu85WPWbnqE0qzZFFbNgO4uX5FRPyNVB13K3AIcBqwIukKZr8O49gROB14f97PpqSkd2+H+zEbEM9kamZmzm4higAAAENJREFUpfOVj5mZlc7Jx8zMSufkY2ZmpXPyMTOz0jn5mJlZ6Zx8zMysdE4+ZmZWOicfMzMrnZOPmZmVzsnHzMxK9/8BKg5riytrCUgAAAAASUVORK5CYII=\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (FALSE)\"))" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
(NN, NNS)(NNS, VBP)(VBP, JJ)(JJ, JJ)(JJ, NN)(NN, NN)(NN, CC)(CC, JJ)(NN, JJ)(JJ, VBZ)...(WDT, MD)(WRB, MD)(MD, DT)(NNS, JJR)(JJR, EX)(VBP, MD)(JJS, WRB)(CD, RB)(JJS, VBG)(RP, TO)
PoN
N1121221211...0000000000
N1000243000...0000000000
N0100240100...0000000000
N0000410100...0000000000
N0000101000...0000000000
\n", "

5 rows × 389 columns

\n", "
" ], "text/plain": [ " (NN, NNS) (NNS, VBP) (VBP, JJ) (JJ, JJ) (JJ, NN) (NN, NN) (NN, CC) \\\n", "PoN \n", "N 1 1 2 1 2 2 1 \n", "N 1 0 0 0 2 4 3 \n", "N 0 1 0 0 2 4 0 \n", "N 0 0 0 0 4 1 0 \n", "N 0 0 0 0 1 0 1 \n", "\n", " (CC, JJ) (NN, JJ) (JJ, VBZ) ... (WDT, MD) (WRB, MD) (MD, DT) \\\n", "PoN ... \n", "N 2 1 1 ... 0 0 0 \n", "N 0 0 0 ... 0 0 0 \n", "N 1 0 0 ... 0 0 0 \n", "N 1 0 0 ... 0 0 0 \n", "N 0 0 0 ... 0 0 0 \n", "\n", " (NNS, JJR) (JJR, EX) (VBP, MD) (JJS, WRB) (CD, RB) (JJS, VBG) \\\n", "PoN \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "\n", " (RP, TO) \n", "PoN \n", "N 0 \n", "N 0 \n", "N 0 \n", "N 0 \n", "N 0 \n", "\n", "[5 rows x 389 columns]" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['bow_pos'] = all_df.apply(lambda x: Counter(x['pos_sent_bi']), axis=1)\n", "\n", "new_df = pd.DataFrame(all_df['bow_pos'].tolist(), all_df['PoN'])\n", "new_df = new_df.fillna(0).astype(int)\n", "new_df[:5]" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.4444444444444444\n" ] } ], "source": [ "get_NB(new_df, new_df.index)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.48148148148148145\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
(NN, NNS)(NNS, VBP)(VBP, JJ)(JJ, JJ)(JJ, NN)(NN, NN)(NN, CC)(CC, JJ)(NN, JJ)(JJ, VBZ)...(WDT, MD)(WRB, MD)(MD, DT)(NNS, JJR)(JJR, EX)(VBP, MD)(JJS, WRB)(CD, RB)(JJS, VBG)(RP, TO)
PoN
N0.0192310.0192310.0384620.0192310.0384620.0384620.0192310.0384620.0192310.019231...0.00.00.00.00.00.00.000000.0000000.0000000.00000
N0.0096150.0000000.0000000.0000000.0192310.0384620.0288460.0000000.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0000000.00000
N0.0000000.0227270.0000000.0000000.0454550.0909090.0000000.0227270.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0000000.00000
N0.0000000.0000000.0000000.0000000.0571430.0142860.0000000.0142860.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0000000.00000
N0.0000000.0000000.0000000.0000000.0285710.0000000.0285710.0000000.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0000000.00000
..................................................................
P0.0000000.0238100.0238100.0000000.0714290.0238100.0238100.0000000.0238100.000000...0.00.00.00.00.00.00.023810.0000000.0000000.00000
P0.0000000.0000000.0000000.0000000.0000000.0434780.0000000.0000000.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0000000.00000
P0.0000000.0000000.0000000.0000000.0306120.0204080.0204080.0102040.0102040.000000...0.00.00.00.00.00.00.000000.0102040.0000000.00000
P0.0000000.0064940.0000000.0129870.0389610.0714290.0259740.0000000.0000000.000000...0.00.00.00.00.00.00.000000.0000000.0064940.00000
P0.0000000.0000000.0238100.0000000.1190480.0476190.0238100.0238100.0000000.000000...0.00.00.00.00.00.00.023810.0000000.0000000.02381
\n", "

90 rows × 389 columns

\n", "
" ], "text/plain": [ " (NN, NNS) (NNS, VBP) (VBP, JJ) (JJ, JJ) (JJ, NN) (NN, NN) (NN, CC) \\\n", "PoN \n", "N 0.019231 0.019231 0.038462 0.019231 0.038462 0.038462 0.019231 \n", "N 0.009615 0.000000 0.000000 0.000000 0.019231 0.038462 0.028846 \n", "N 0.000000 0.022727 0.000000 0.000000 0.045455 0.090909 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.057143 0.014286 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.028571 0.000000 0.028571 \n", ".. ... ... ... ... ... ... ... \n", "P 0.000000 0.023810 0.023810 0.000000 0.071429 0.023810 0.023810 \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.043478 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.030612 0.020408 0.020408 \n", "P 0.000000 0.006494 0.000000 0.012987 0.038961 0.071429 0.025974 \n", "P 0.000000 0.000000 0.023810 0.000000 0.119048 0.047619 0.023810 \n", "\n", " (CC, JJ) (NN, JJ) (JJ, VBZ) ... (WDT, MD) (WRB, MD) (MD, DT) \\\n", "PoN ... \n", "N 0.038462 0.019231 0.019231 ... 0.0 0.0 0.0 \n", "N 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "N 0.022727 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "N 0.014286 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "N 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 \n", ".. ... ... ... ... ... ... ... \n", "P 0.000000 0.023810 0.000000 ... 0.0 0.0 0.0 \n", "P 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "P 0.010204 0.010204 0.000000 ... 0.0 0.0 0.0 \n", "P 0.000000 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "P 0.023810 0.000000 0.000000 ... 0.0 0.0 0.0 \n", "\n", " (NNS, JJR) (JJR, EX) (VBP, MD) (JJS, WRB) (CD, RB) (JJS, VBG) \\\n", "PoN \n", "N 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", "N 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", "N 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", "N 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", "N 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", ".. ... ... ... ... ... ... 
\n", "P 0.0 0.0 0.0 0.02381 0.000000 0.000000 \n", "P 0.0 0.0 0.0 0.00000 0.000000 0.000000 \n", "P 0.0 0.0 0.0 0.00000 0.010204 0.000000 \n", "P 0.0 0.0 0.0 0.00000 0.000000 0.006494 \n", "P 0.0 0.0 0.0 0.02381 0.000000 0.000000 \n", "\n", " (RP, TO) \n", "PoN \n", "N 0.00000 \n", "N 0.00000 \n", "N 0.00000 \n", "N 0.00000 \n", "N 0.00000 \n", ".. ... \n", "P 0.00000 \n", "P 0.00000 \n", "P 0.00000 \n", "P 0.00000 \n", "P 0.02381 \n", "\n", "[90 rows x 389 columns]" ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bi_df_norm = new_df.copy()\n", "bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "# bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "bi_df_norm\n", "\n", "get_NB(bi_df_norm, bi_df_norm.index)\n", "bi_df_norm" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
11(DT, NN)245
38(JJ, NN)151
0(NN, NN)125
44(NN, IN)114
1(NN, VBD)112
43(IN, DT)111
5(NN, CC)77
19(TO, VB)76
37(DT, JJ)62
26(RB, JJ)61
105(PRP, VBD)60
29(IN, NN)57
2(VBD, DT)55
25(VBD, RB)45
75(PRP$, NN)44
12(NN, VBZ)42
83(NN, RB)38
86(IN, JJ)37
39(CC, DT)36
65(JJ, CC)34
\n", "
" ], "text/plain": [ " word count\n", "11 (DT, NN) 245\n", "38 (JJ, NN) 151\n", "0 (NN, NN) 125\n", "44 (NN, IN) 114\n", "1 (NN, VBD) 112\n", "43 (IN, DT) 111\n", "5 (NN, CC) 77\n", "19 (TO, VB) 76\n", "37 (DT, JJ) 62\n", "26 (RB, JJ) 61\n", "105 (PRP, VBD) 60\n", "29 (IN, NN) 57\n", "2 (VBD, DT) 55\n", "25 (VBD, RB) 45\n", "75 (PRP$, NN) 44\n", "12 (NN, VBZ) 42\n", "83 (NN, RB) 38\n", "86 (IN, JJ) 37\n", "39 (CC, DT) 36\n", "65 (JJ, CC) 34" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_p.sort_values(by=[\"count\"], ascending=False)[:20]" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
11(DT, NN)230
4(JJ, NN)145
24(NN, IN)122
49(NN, VBD)106
5(NN, NN)102
32(IN, DT)96
33(DT, JJ)76
44(IN, NN)65
20(TO, VB)59
12(NN, VBZ)56
14(RB, JJ)53
59(PRP, VBD)48
6(NN, CC)48
60(VBD, RB)47
79(NN, DT)41
52(VBD, DT)37
28(IN, PRP)36
61(JJ, CC)35
74(VBD, JJ)34
57(PRP$, NN)33
\n", "
" ], "text/plain": [ " word count\n", "11 (DT, NN) 230\n", "4 (JJ, NN) 145\n", "24 (NN, IN) 122\n", "49 (NN, VBD) 106\n", "5 (NN, NN) 102\n", "32 (IN, DT) 96\n", "33 (DT, JJ) 76\n", "44 (IN, NN) 65\n", "20 (TO, VB) 59\n", "12 (NN, VBZ) 56\n", "14 (RB, JJ) 53\n", "59 (PRP, VBD) 48\n", "6 (NN, CC) 48\n", "60 (VBD, RB) 47\n", "79 (NN, DT) 41\n", "52 (VBD, DT) 37\n", "28 (IN, PRP) 36\n", "61 (JJ, CC) 35\n", "74 (VBD, JJ) 34\n", "57 (PRP$, NN) 33" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_n.sort_values(by=[\"count\"], ascending=False)[:20]" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [], "source": [ "from nltk import word_tokenize \n", "from nltk.util import ngrams\n", "\n", "text = ['cant railway station', 'citadel hotel', 'police stn']\n", "def get_ngram(line, num):\n", " token = nltk.word_tokenize(line)\n", " grams = list(ngrams(token, num)) \n", " return(grams)\n", "\n", "# all_df['trigrams'] = all_df.apply(lambda x: get_ngram(x[0],3), axis=1)\n", "all_df['trigrams'] = all_df.apply(lambda x: get_ngram(' '.join(x['tokens']),3), axis=1)\n", "all_df['trigrams_pos'] = all_df.apply(lambda x: get_ngram(' '.join(x['pos_sent']),3), axis=1)\n", "\n", "# ' '.join(all_df['tokens'][1])\n", " \n", "# counter = all_df['trigrams_pos']" ] }, { "cell_type": "code", "execution_count": 64, "metadata": {}, "outputs": [], "source": [ "all_df['trigrams_feats'] = all_df.apply(lambda x: ['_'.join(trigram) for trigram in x['trigrams_pos']], axis=1)" ] }, { "cell_type": "code", "execution_count": 65, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['JJ_NN_IN', 'NN_IN_DT', 'NN_IN_NN'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NN_NNS_VBPNNS_VBP_JJVBP_JJ_JJJJ_JJ_NNJJ_NN_NNNN_NN_CCNN_CC_JJCC_JJ_NNJJ_NN_JJNN_JJ_VBZ...RB_NN_NNVBD_RP_PRPPRP_TO_JJTO_JJ_JJCC_VB_NNJJS_WRB_NNNN_VBP_RPVBP_RP_TORP_TO_VBVBD_PRP_CC
PoN
N1111111111...0000000000
N0000210000...0000000000
N0000100000...0000000000
N0000000000...0000000000
N0000000000...0000000000
\n", "

5 rows × 1683 columns

\n", "" ], "text/plain": [ " NN_NNS_VBP NNS_VBP_JJ VBP_JJ_JJ JJ_JJ_NN JJ_NN_NN NN_NN_CC \\\n", "PoN \n", "N 1 1 1 1 1 1 \n", "N 0 0 0 0 2 1 \n", "N 0 0 0 0 1 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "\n", " NN_CC_JJ CC_JJ_NN JJ_NN_JJ NN_JJ_VBZ ... RB_NN_NN VBD_RP_PRP \\\n", "PoN ... \n", "N 1 1 1 1 ... 0 0 \n", "N 0 0 0 0 ... 0 0 \n", "N 0 0 0 0 ... 0 0 \n", "N 0 0 0 0 ... 0 0 \n", "N 0 0 0 0 ... 0 0 \n", "\n", " PRP_TO_JJ TO_JJ_JJ CC_VB_NN JJS_WRB_NN NN_VBP_RP VBP_RP_TO \\\n", "PoN \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "\n", " RP_TO_VB VBD_PRP_CC \n", "PoN \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "\n", "[5 rows x 1683 columns]" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# One row per review (indexed by its PoN label), one column per POS-trigram feature;\n", "# a feature that a review does not contain is counted as 0.\n", "new_df = (\n", "    pd.DataFrame(all_df['trigrams_feats_bow'].tolist(), index=all_df['PoN'])\n", "    .fillna(0)\n", "    .astype(int)\n", ")\n", "new_df.head()\n" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5185185185185185\n" ] } ], "source": [ "get_NB(new_df, new_df.index)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5185185185185185\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NN_NNS_VBPNNS_VBP_JJVBP_JJ_JJJJ_JJ_NNJJ_NN_NNNN_NN_CCNN_CC_JJCC_JJ_NNJJ_NN_JJNN_JJ_VBZ...RB_NN_NNVBD_RP_PRPPRP_TO_JJTO_JJ_JJCC_VB_NNJJS_WRB_NNNN_VBP_RPVBP_RP_TORP_TO_VBVBD_PRP_CC
PoN
N0.0192310.0192310.0192310.0192310.0192310.0192310.0192310.0192310.0192310.019231...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
N0.0000000.0000000.0000000.0000000.0188680.0094340.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
N0.0000000.0000000.0000000.0000000.0232560.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
N0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
N0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
..................................................................
P0.0000000.0243900.0000000.0000000.0000000.0243900.0000000.0000000.0243900.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
P0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
P0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.000000.000000.000000.000000.00000
P0.0000000.0000000.0000000.0123460.0308640.0123460.0000000.0000000.0000000.000000...0.0061730.0061730.0061730.0061730.0061730.000000.000000.000000.000000.00000
P0.0000000.0000000.0000000.0000000.0487800.0000000.0243900.0243900.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.024390.024390.024390.024390.02439
\n", "

90 rows × 1683 columns

\n", "
" ], "text/plain": [ " NN_NNS_VBP NNS_VBP_JJ VBP_JJ_JJ JJ_JJ_NN JJ_NN_NN NN_NN_CC \\\n", "PoN \n", "N 0.019231 0.019231 0.019231 0.019231 0.019231 0.019231 \n", "N 0.000000 0.000000 0.000000 0.000000 0.018868 0.009434 \n", "N 0.000000 0.000000 0.000000 0.000000 0.023256 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", ".. ... ... ... ... ... ... \n", "P 0.000000 0.024390 0.000000 0.000000 0.000000 0.024390 \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.012346 0.030864 0.012346 \n", "P 0.000000 0.000000 0.000000 0.000000 0.048780 0.000000 \n", "\n", " NN_CC_JJ CC_JJ_NN JJ_NN_JJ NN_JJ_VBZ ... RB_NN_NN VBD_RP_PRP \\\n", "PoN ... \n", "N 0.019231 0.019231 0.019231 0.019231 ... 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", ".. ... ... ... ... ... ... ... \n", "P 0.000000 0.000000 0.024390 0.000000 ... 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 ... 0.006173 0.006173 \n", "P 0.024390 0.024390 0.000000 0.000000 ... 0.000000 0.000000 \n", "\n", " PRP_TO_JJ TO_JJ_JJ CC_VB_NN JJS_WRB_NN NN_VBP_RP VBP_RP_TO \\\n", "PoN \n", "N 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "N 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "N 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "N 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "N 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", ".. ... ... ... ... ... ... 
\n", "P 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "P 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "P 0.000000 0.000000 0.000000 0.00000 0.00000 0.00000 \n", "P 0.006173 0.006173 0.006173 0.00000 0.00000 0.00000 \n", "P 0.000000 0.000000 0.000000 0.02439 0.02439 0.02439 \n", "\n", " RP_TO_VB VBD_PRP_CC \n", "PoN \n", "N 0.00000 0.00000 \n", "N 0.00000 0.00000 \n", "N 0.00000 0.00000 \n", "N 0.00000 0.00000 \n", "N 0.00000 0.00000 \n", ".. ... ... \n", "P 0.00000 0.00000 \n", "P 0.00000 0.00000 \n", "P 0.00000 0.00000 \n", "P 0.00000 0.00000 \n", "P 0.02439 0.02439 \n", "\n", "[90 rows x 1683 columns]" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Convert raw trigram counts into per-review relative frequencies by dividing\n", "# every row by its own total. A review without any trigram has a total of 0,\n", "# and 0/0 yields NaN, so those rows are reset to 0 before they reach get_NB.\n", "tri_df_norm = new_df.div(new_df.sum(axis=1), axis=0).fillna(0)\n", "\n", "get_NB(tri_df_norm, tri_df_norm.index)\n", "tri_df_norm" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw...bow_no_swpos_sentpos_sent_strpos_no_sw_sentpos_sent_bibow_postrigramstrigrams_postrigrams_featstrigrams_feats_bow
1Twin Trees Cicero NY HUGE salad bar and high q...N[Twin Trees Cicero NY HUGE salad bar and high ...4[twin, trees, cicero, ny, huge, salad, bar, an...53[twin, trees, cicero, ny, huge, salad, bar, hi...32[twin, tree, cicero, ny, huge, salad, bar, and...[twin, tree, cicero, ny, huge, salad, bar, hig......{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...[NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,...[NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ...[NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS...[(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (...{('NN', 'NNS'): 1, ('NNS', 'VBP'): 1, ('VBP', ...[(twin, trees, cicero), (trees, cicero, ny), (...[(NN, NNS, VBP), (NNS, VBP, JJ), (VBP, JJ, JJ)...[NN_NNS_VBP, NNS_VBP_JJ, VBP_JJ_JJ, JJ_JJ_NN, ...{'NN_NNS_VBP': 1, 'NNS_VBP_JJ': 1, 'VBP_JJ_JJ'...
2The worst restaurant that I have ever eaten in...N[The worst restaurant that I have ever eaten i...5[the, worst, restaurant, that, i, have, ever, ...105[worst, restaurant, ever, eaten, undoubtedly, ...49[the, worst, restaur, that, i, have, ever, eat...[worst, restaur, ever, eaten, undoubtedli, pla......{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...[DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R...[DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB...[RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN...[(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN...{('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'I...[(the, worst, restaurant), (worst, restaurant,...[(DT, JJS, NN), (JJS, NN, IN), (NN, IN, NN), (...[DT_JJS_NN, JJS_NN_IN, NN_IN_NN, IN_NN_VBP, NN...{'DT_JJS_NN': 1, 'JJS_NN_IN': 1, 'NN_IN_NN': 2...
4I have been to a Asian restaurant in New York ...N[I have been to a Asian restaurant in New York...4[i, have, been, to, a, asian, restaurant, in, ...45[asian, restaurant, new, york, city, menu, wri...23[i, have, been, to, a, asian, restaur, in, new...[asian, restaur, new, york, citi, menu, writte......{'asian': 1, 'restaurant': 1, 'new': 1, 'york'...[NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN...[NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ...[JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,...[(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ...{('NNS', 'VBP'): 1, ('VBP', 'VBN'): 1, ('VBN',...[(i, have, been), (have, been, to), (been, to,...[(NNS, VBP, VBN), (VBP, VBN, TO), (VBN, TO, DT...[NNS_VBP_VBN, VBP_VBN_TO, VBN_TO_DT, TO_DT_JJ,...{'NNS_VBP_VBN': 1, 'VBP_VBN_TO': 1, 'VBN_TO_DT...
5The best restaurant I have gone to is when I w...N[The best restaurant I have gone to is when I ...6[the, best, restaurant, i, have, gone, to, is,...71[best, restaurant, gone, went, applebee, frien...30[the, best, restaur, i, have, gone, to, is, wh...[best, restaur, gone, went, applebe, friend, s......{'best': 1, 'restaurant': 2, 'gone': 1, 'went'...[DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ...[DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ...[RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V...[(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V...{('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'N...[(the, best, restaurant), (best, restaurant, i...[(DT, JJS, NN), (JJS, NN, NN), (NN, NN, VBP), ...[DT_JJS_NN, JJS_NN_NN, NN_NN_VBP, NN_VBP_VBN, ...{'DT_JJS_NN': 1, 'JJS_NN_NN': 1, 'NN_NN_VBP': ...
6The restaurant looked pretty good the people a...N[The restaurant looked pretty good the people ...3[the, restaurant, looked, pretty, good, the, p...36[restaurant, looked, pretty, good, people, aro...19[the, restaur, look, pretti, good, the, peopl,...[restaur, look, pretti, good, peopl, around, a......{'restaurant': 1, 'looked': 1, 'pretty': 1, 'g...[DT, NN, VBD, RB, JJ, DT, NNS, IN, PRP, DT, NN...[DT NN VBD RB JJ DT NNS IN PRP DT NN CC VBD RB...[NN, VBD, RB, JJ, NNS, IN, NN, VBD, RB, NN, JJ...[(DT, NN), (NN, VBD), (VBD, RB), (RB, JJ), (JJ...{('DT', 'NN'): 5, ('NN', 'VBD'): 3, ('VBD', 'R...[(the, restaurant, looked), (restaurant, looke...[(DT, NN, VBD), (NN, VBD, RB), (VBD, RB, JJ), ...[DT_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB_JJ_DT, JJ...{'DT_NN_VBD': 3, 'NN_VBD_RB': 1, 'VBD_RB_JJ': ...
..................................................................
87Mikes Pizza High Point NY Service was very slo...P[Mikes Pizza High Point NY Service was very sl...4[mikes, pizza, high, point, ny, service, was, ...43[mikes, pizza, high, point, ny, service, slow,...26[mike, pizza, high, point, ny, servic, wa, ver...[mike, pizza, high, point, ny, servic, slow, q......{'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1...[NNS, VBP, JJ, NN, JJ, NN, VBD, RB, JJ, CC, DT...[NNS VBP JJ NN JJ NN VBD RB JJ CC DT NN VBD JJ...[NNS, VBP, JJ, NN, JJ, NN, JJ, NN, NN, MD, VB,...[(NNS, VBP), (VBP, JJ), (JJ, NN), (NN, JJ), (J...{('NNS', 'VBP'): 1, ('VBP', 'JJ'): 1, ('JJ', '...[(mikes, pizza, high), (pizza, high, point), (...[(NNS, VBP, JJ), (VBP, JJ, NN), (JJ, NN, JJ), ...[NNS_VBP_JJ, VBP_JJ_NN, JJ_NN_JJ, NN_JJ_NN, JJ...{'NNS_VBP_JJ': 1, 'VBP_JJ_NN': 1, 'JJ_NN_JJ': ...
88After I went shopping with some of my friend w...P[After I went shopping with some of my friend ...2[after, i, went, shopping, with, some, of, my,...24[went, shopping, friend, went, dodo, restauran...11[after, i, went, shop, with, some, of, my, fri...[went, shop, friend, went, dodo, restaur, dinn......{'went': 2, 'shopping': 1, 'friend': 1, 'dodo'...[IN, JJ, VBD, VBG, IN, DT, IN, PRP$, NN, PRP, ...[IN JJ VBD VBG IN DT IN PRP$ NN PRP VBD TO VB ...[VBD, VBG, NN, VBD, JJ, NN, NN, VBD, RB, CD, NNS][(IN, JJ), (JJ, VBD), (VBD, VBG), (VBG, IN), (...{('IN', 'JJ'): 1, ('JJ', 'VBD'): 1, ('VBD', 'V...[(after, i, went), (i, went, shopping), (went,...[(IN, JJ, VBD), (JJ, VBD, VBG), (VBD, VBG, IN)...[IN_JJ_VBD, JJ_VBD_VBG, VBD_VBG_IN, VBG_IN_DT,...{'IN_JJ_VBD': 1, 'JJ_VBD_VBG': 1, 'VBD_VBG_IN'...
89I entered the restaurant and a waitress came b...P[I entered the restaurant and a waitress came ...5[i, entered, the, restaurant, and, a, waitress...99[entered, restaurant, waitress, came, blanking...49[i, enter, the, restaur, and, a, waitress, cam...[enter, restaur, waitress, came, blank, look, ......{'entered': 1, 'restaurant': 1, 'waitress': 2,...[NN, VBD, DT, NN, CC, DT, NN, VBD, IN, IN, DT,...[NN VBD DT NN CC DT NN VBD IN IN DT NN VBG CC ...[VBN, NN, NN, VBD, VBG, VBG, JJ, NN, NN, VBD, ...[(NN, VBD), (VBD, DT), (DT, NN), (NN, CC), (CC...{('NN', 'VBD'): 5, ('VBD', 'DT'): 4, ('DT', 'N...[(i, entered, the), (entered, the, restaurant)...[(NN, VBD, DT), (VBD, DT, NN), (DT, NN, CC), (...[NN_VBD_DT, VBD_DT_NN, DT_NN_CC, NN_CC_DT, CC_...{'NN_VBD_DT': 1, 'VBD_DT_NN': 3, 'DT_NN_CC': 1...
90Carlos Plate Shack was the worst dining experi...P[Carlos Plate Shack was the worst dining exper...9[carlos, plate, shack, was, the, worst, dining...155[carlos, plate, shack, worst, dining, experien...88[carlo, plate, shack, wa, the, worst, dine, ex...[carlo, plate, shack, worst, dine, experi, lif......{'carlos': 1, 'plate': 6, 'shack': 1, 'worst':...[NN, NN, NN, VBD, DT, JJS, VBG, NN, IN, PRP$, ...[NN NN NN VBD DT JJS VBG NN IN PRP$ NN IN PRP$...[NN, NN, NN, JJS, VBG, NN, NN, IN, JJ, NN, NN,...[(NN, NN), (NN, NN), (NN, VBD), (VBD, DT), (DT...{('NN', 'NN'): 11, ('NN', 'VBD'): 6, ('VBD', '...[(carlos, plate, shack), (plate, shack, was), ...[(NN, NN, NN), (NN, NN, VBD), (NN, VBD, DT), (...[NN_NN_NN, NN_NN_VBD, NN_VBD_DT, VBD_DT_JJS, D...{'NN_NN_NN': 2, 'NN_NN_VBD': 3, 'NN_VBD_DT': 2...
91Olive Oil Garden was very disappointing. I exp...P[Olive Oil Garden was very disappointing., I e...5[olive, oil, garden, was, very, disappointing,...43[olive, oil, garden, disappointing, expect, go...23[oliv, oil, garden, wa, veri, disappoint, i, e...[oliv, oil, garden, disappoint, expect, good, ......{'olive': 2, 'oil': 2, 'garden': 2, 'disappoin...[JJ, NN, NN, VBD, RB, JJ, NN, VBP, JJ, NN, CC,...[JJ NN NN VBD RB JJ NN VBP JJ NN CC JJ NN IN J...[JJ, NN, NN, NN, VBP, JJ, NN, JJ, NN, JJS, VB,...[(JJ, NN), (NN, NN), (NN, VBD), (VBD, RB), (RB...{('JJ', 'NN'): 5, ('NN', 'NN'): 2, ('NN', 'VBD...[(olive, oil, garden), (oil, garden, was), (ga...[(JJ, NN, NN), (NN, NN, VBD), (NN, VBD, RB), (...[JJ_NN_NN, NN_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB...{'JJ_NN_NN': 2, 'NN_NN_VBD': 1, 'NN_VBD_RB': 1...
\n", "

90 rows × 27 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "1 Twin Trees Cicero NY HUGE salad bar and high q... N \n", "2 The worst restaurant that I have ever eaten in... N \n", "4 I have been to a Asian restaurant in New York ... N \n", "5 The best restaurant I have gone to is when I w... N \n", "6 The restaurant looked pretty good the people a... N \n", ".. ... .. \n", "87 Mikes Pizza High Point NY Service was very slo... P \n", "88 After I went shopping with some of my friend w... P \n", "89 I entered the restaurant and a waitress came b... P \n", "90 Carlos Plate Shack was the worst dining experi... P \n", "91 Olive Oil Garden was very disappointing. I exp... P \n", "\n", " sentences num_sentences \\\n", "1 [Twin Trees Cicero NY HUGE salad bar and high ... 4 \n", "2 [The worst restaurant that I have ever eaten i... 5 \n", "4 [I have been to a Asian restaurant in New York... 4 \n", "5 [The best restaurant I have gone to is when I ... 6 \n", "6 [The restaurant looked pretty good the people ... 3 \n", ".. ... ... \n", "87 [Mikes Pizza High Point NY Service was very sl... 4 \n", "88 [After I went shopping with some of my friend ... 2 \n", "89 [I entered the restaurant and a waitress came ... 5 \n", "90 [Carlos Plate Shack was the worst dining exper... 9 \n", "91 [Olive Oil Garden was very disappointing., I e... 5 \n", "\n", " tokens num_tokens \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, an... 53 \n", "2 [the, worst, restaurant, that, i, have, ever, ... 105 \n", "4 [i, have, been, to, a, asian, restaurant, in, ... 45 \n", "5 [the, best, restaurant, i, have, gone, to, is,... 71 \n", "6 [the, restaurant, looked, pretty, good, the, p... 36 \n", ".. ... ... \n", "87 [mikes, pizza, high, point, ny, service, was, ... 43 \n", "88 [after, i, went, shopping, with, some, of, my,... 24 \n", "89 [i, entered, the, restaurant, and, a, waitress... 99 \n", "90 [carlos, plate, shack, was, the, worst, dining... 155 \n", "91 [olive, oil, garden, was, very, disappointing,... 
43 \n", "\n", " no_sw num_no_sw \\\n", "1 [twin, trees, cicero, ny, huge, salad, bar, hi... 32 \n", "2 [worst, restaurant, ever, eaten, undoubtedly, ... 49 \n", "4 [asian, restaurant, new, york, city, menu, wri... 23 \n", "5 [best, restaurant, gone, went, applebee, frien... 30 \n", "6 [restaurant, looked, pretty, good, people, aro... 19 \n", ".. ... ... \n", "87 [mikes, pizza, high, point, ny, service, slow,... 26 \n", "88 [went, shopping, friend, went, dodo, restauran... 11 \n", "89 [entered, restaurant, waitress, came, blanking... 49 \n", "90 [carlos, plate, shack, worst, dining, experien... 88 \n", "91 [olive, oil, garden, disappointing, expect, go... 23 \n", "\n", " stemmed \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, and... \n", "2 [the, worst, restaur, that, i, have, ever, eat... \n", "4 [i, have, been, to, a, asian, restaur, in, new... \n", "5 [the, best, restaur, i, have, gone, to, is, wh... \n", "6 [the, restaur, look, pretti, good, the, peopl,... \n", ".. ... \n", "87 [mike, pizza, high, point, ny, servic, wa, ver... \n", "88 [after, i, went, shop, with, some, of, my, fri... \n", "89 [i, enter, the, restaur, and, a, waitress, cam... \n", "90 [carlo, plate, shack, wa, the, worst, dine, ex... \n", "91 [oliv, oil, garden, wa, veri, disappoint, i, e... \n", "\n", " stemmed_no_sw ... \\\n", "1 [twin, tree, cicero, ny, huge, salad, bar, hig... ... \n", "2 [worst, restaur, ever, eaten, undoubtedli, pla... ... \n", "4 [asian, restaur, new, york, citi, menu, writte... ... \n", "5 [best, restaur, gone, went, applebe, friend, s... ... \n", "6 [restaur, look, pretti, good, peopl, around, a... ... \n", ".. ... ... \n", "87 [mike, pizza, high, point, ny, servic, slow, q... ... \n", "88 [went, shop, friend, went, dodo, restaur, dinn... ... \n", "89 [enter, restaur, waitress, came, blank, look, ... ... \n", "90 [carlo, plate, shack, worst, dine, experi, lif... ... \n", "91 [oliv, oil, garden, disappoint, expect, good, ... ... 
\n", "\n", " bow_no_sw \\\n", "1 {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ... \n", "2 {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate... \n", "4 {'asian': 1, 'restaurant': 1, 'new': 1, 'york'... \n", "5 {'best': 1, 'restaurant': 2, 'gone': 1, 'went'... \n", "6 {'restaurant': 1, 'looked': 1, 'pretty': 1, 'g... \n", ".. ... \n", "87 {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1... \n", "88 {'went': 2, 'shopping': 1, 'friend': 1, 'dodo'... \n", "89 {'entered': 1, 'restaurant': 1, 'waitress': 2,... \n", "90 {'carlos': 1, 'plate': 6, 'shack': 1, 'worst':... \n", "91 {'olive': 2, 'oil': 2, 'garden': 2, 'disappoin... \n", "\n", " pos_sent \\\n", "1 [NN, NNS, VBP, JJ, JJ, NN, NN, CC, JJ, NN, JJ,... \n", "2 [DT, JJS, NN, IN, NN, VBP, RB, VBN, IN, VBZ, R... \n", "4 [NNS, VBP, VBN, TO, DT, JJ, NN, IN, JJ, NN, NN... \n", "5 [DT, JJS, NN, NN, VBP, VBN, TO, VBZ, WRB, JJ, ... \n", "6 [DT, NN, VBD, RB, JJ, DT, NNS, IN, PRP, DT, NN... \n", ".. ... \n", "87 [NNS, VBP, JJ, NN, JJ, NN, VBD, RB, JJ, CC, DT... \n", "88 [IN, JJ, VBD, VBG, IN, DT, IN, PRP$, NN, PRP, ... \n", "89 [NN, VBD, DT, NN, CC, DT, NN, VBD, IN, IN, DT,... \n", "90 [NN, NN, NN, VBD, DT, JJS, VBG, NN, IN, PRP$, ... \n", "91 [JJ, NN, NN, VBD, RB, JJ, NN, VBP, JJ, NN, CC,... \n", "\n", " pos_sent_str \\\n", "1 [NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN ... \n", "2 [DT JJS NN IN NN VBP RB VBN IN VBZ RB DT NN VB... \n", "4 [NNS VBP VBN TO DT JJ NN IN JJ NN NN DT NN VBZ... \n", "5 [DT JJS NN NN VBP VBN TO VBZ WRB JJ VBD TO VB ... \n", "6 [DT NN VBD RB JJ DT NNS IN PRP DT NN CC VBD RB... \n", ".. ... \n", "87 [NNS VBP JJ NN JJ NN VBD RB JJ CC DT NN VBD JJ... \n", "88 [IN JJ VBD VBG IN DT IN PRP$ NN PRP VBD TO VB ... \n", "89 [NN VBD DT NN CC DT NN VBD IN IN DT NN VBG CC ... \n", "90 [NN NN NN VBD DT JJS VBG NN IN PRP$ NN IN PRP$... \n", "91 [JJ NN NN VBD RB JJ NN VBP JJ NN CC JJ NN IN J... \n", "\n", " pos_no_sw_sent \\\n", "1 [NN, NNS, VBP, JJ, JJ, NN, NN, JJ, NN, JJ, NNS... 
\n", "2 [RBS, NN, RB, RB, JJ, NN, VBN, NN, NN, VBD, NN... \n", "4 [JJ, NN, JJ, NN, NN, NN, VBN, JJ, JJ, VBP, JJ,... \n", "5 [RBS, NN, VBN, VBD, JJ, NNS, NN, RB, NN, JJ, V... \n", "6 [NN, VBD, RB, JJ, NNS, IN, NN, VBD, RB, NN, JJ... \n", ".. ... \n", "87 [NNS, VBP, JJ, NN, JJ, NN, JJ, NN, NN, MD, VB,... \n", "88 [VBD, VBG, NN, VBD, JJ, NN, NN, VBD, RB, CD, NNS] \n", "89 [VBN, NN, NN, VBD, VBG, VBG, JJ, NN, NN, VBD, ... \n", "90 [NN, NN, NN, JJS, VBG, NN, NN, IN, JJ, NN, NN,... \n", "91 [JJ, NN, NN, NN, VBP, JJ, NN, JJ, NN, JJS, VB,... \n", "\n", " pos_sent_bi \\\n", "1 [(NN, NNS), (NNS, VBP), (VBP, JJ), (JJ, JJ), (... \n", "2 [(DT, JJS), (JJS, NN), (NN, IN), (IN, NN), (NN... \n", "4 [(NNS, VBP), (VBP, VBN), (VBN, TO), (TO, DT), ... \n", "5 [(DT, JJS), (JJS, NN), (NN, NN), (NN, VBP), (V... \n", "6 [(DT, NN), (NN, VBD), (VBD, RB), (RB, JJ), (JJ... \n", ".. ... \n", "87 [(NNS, VBP), (VBP, JJ), (JJ, NN), (NN, JJ), (J... \n", "88 [(IN, JJ), (JJ, VBD), (VBD, VBG), (VBG, IN), (... \n", "89 [(NN, VBD), (VBD, DT), (DT, NN), (NN, CC), (CC... \n", "90 [(NN, NN), (NN, NN), (NN, VBD), (VBD, DT), (DT... \n", "91 [(JJ, NN), (NN, NN), (NN, VBD), (VBD, RB), (RB... \n", "\n", " bow_pos \\\n", "1 {('NN', 'NNS'): 1, ('NNS', 'VBP'): 1, ('VBP', ... \n", "2 {('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'I... \n", "4 {('NNS', 'VBP'): 1, ('VBP', 'VBN'): 1, ('VBN',... \n", "5 {('DT', 'JJS'): 1, ('JJS', 'NN'): 1, ('NN', 'N... \n", "6 {('DT', 'NN'): 5, ('NN', 'VBD'): 3, ('VBD', 'R... \n", ".. ... \n", "87 {('NNS', 'VBP'): 1, ('VBP', 'JJ'): 1, ('JJ', '... \n", "88 {('IN', 'JJ'): 1, ('JJ', 'VBD'): 1, ('VBD', 'V... \n", "89 {('NN', 'VBD'): 5, ('VBD', 'DT'): 4, ('DT', 'N... \n", "90 {('NN', 'NN'): 11, ('NN', 'VBD'): 6, ('VBD', '... \n", "91 {('JJ', 'NN'): 5, ('NN', 'NN'): 2, ('NN', 'VBD... \n", "\n", " trigrams \\\n", "1 [(twin, trees, cicero), (trees, cicero, ny), (... \n", "2 [(the, worst, restaurant), (worst, restaurant,... \n", "4 [(i, have, been), (have, been, to), (been, to,... 
\n", "5 [(the, best, restaurant), (best, restaurant, i... \n", "6 [(the, restaurant, looked), (restaurant, looke... \n", ".. ... \n", "87 [(mikes, pizza, high), (pizza, high, point), (... \n", "88 [(after, i, went), (i, went, shopping), (went,... \n", "89 [(i, entered, the), (entered, the, restaurant)... \n", "90 [(carlos, plate, shack), (plate, shack, was), ... \n", "91 [(olive, oil, garden), (oil, garden, was), (ga... \n", "\n", " trigrams_pos \\\n", "1 [(NN, NNS, VBP), (NNS, VBP, JJ), (VBP, JJ, JJ)... \n", "2 [(DT, JJS, NN), (JJS, NN, IN), (NN, IN, NN), (... \n", "4 [(NNS, VBP, VBN), (VBP, VBN, TO), (VBN, TO, DT... \n", "5 [(DT, JJS, NN), (JJS, NN, NN), (NN, NN, VBP), ... \n", "6 [(DT, NN, VBD), (NN, VBD, RB), (VBD, RB, JJ), ... \n", ".. ... \n", "87 [(NNS, VBP, JJ), (VBP, JJ, NN), (JJ, NN, JJ), ... \n", "88 [(IN, JJ, VBD), (JJ, VBD, VBG), (VBD, VBG, IN)... \n", "89 [(NN, VBD, DT), (VBD, DT, NN), (DT, NN, CC), (... \n", "90 [(NN, NN, NN), (NN, NN, VBD), (NN, VBD, DT), (... \n", "91 [(JJ, NN, NN), (NN, NN, VBD), (NN, VBD, RB), (... \n", "\n", " trigrams_feats \\\n", "1 [NN_NNS_VBP, NNS_VBP_JJ, VBP_JJ_JJ, JJ_JJ_NN, ... \n", "2 [DT_JJS_NN, JJS_NN_IN, NN_IN_NN, IN_NN_VBP, NN... \n", "4 [NNS_VBP_VBN, VBP_VBN_TO, VBN_TO_DT, TO_DT_JJ,... \n", "5 [DT_JJS_NN, JJS_NN_NN, NN_NN_VBP, NN_VBP_VBN, ... \n", "6 [DT_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB_JJ_DT, JJ... \n", ".. ... \n", "87 [NNS_VBP_JJ, VBP_JJ_NN, JJ_NN_JJ, NN_JJ_NN, JJ... \n", "88 [IN_JJ_VBD, JJ_VBD_VBG, VBD_VBG_IN, VBG_IN_DT,... \n", "89 [NN_VBD_DT, VBD_DT_NN, DT_NN_CC, NN_CC_DT, CC_... \n", "90 [NN_NN_NN, NN_NN_VBD, NN_VBD_DT, VBD_DT_JJS, D... \n", "91 [JJ_NN_NN, NN_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB... \n", "\n", " trigrams_feats_bow \n", "1 {'NN_NNS_VBP': 1, 'NNS_VBP_JJ': 1, 'VBP_JJ_JJ'... \n", "2 {'DT_JJS_NN': 1, 'JJS_NN_IN': 1, 'NN_IN_NN': 2... \n", "4 {'NNS_VBP_VBN': 1, 'VBP_VBN_TO': 1, 'VBN_TO_DT... \n", "5 {'DT_JJS_NN': 1, 'JJS_NN_NN': 1, 'NN_NN_VBP': ... \n", "6 {'DT_NN_VBD': 3, 'NN_VBD_RB': 1, 'VBD_RB_JJ': ... 
\n", ".. ... \n", "87 {'NNS_VBP_JJ': 1, 'VBP_JJ_NN': 1, 'JJ_NN_JJ': ... \n", "88 {'IN_JJ_VBD': 1, 'JJ_VBD_VBG': 1, 'VBD_VBG_IN'... \n", "89 {'NN_VBD_DT': 1, 'VBD_DT_NN': 3, 'DT_NN_CC': 1... \n", "90 {'NN_NN_NN': 2, 'NN_NN_VBD': 3, 'NN_VBD_DT': 2... \n", "91 {'JJ_NN_NN': 2, 'NN_NN_VBD': 1, 'NN_VBD_RB': 1... \n", "\n", "[90 rows x 27 columns]" ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Display all_df with every derived column (the stored output shows 90 rows x 27 columns).\n", "all_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }