{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW4 [Deception] PART 2 -- Check with Myle Data" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: GET THAT DATA" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "import os\n", "def get_data(file, path):\n", " f=open(path+file)\n", " data = f.read()\n", " f.close()\n", " return data\n", " \n", "def get_data_from_files(path):\n", " results = [get_data(file, path) for file in os.listdir(path)]\n", " return results\n", "\n", "# pos = get_data_from_files('../pos_cornell//')\n", "# neg = get_data_from_files('../neg_cornell/')\n", "\n", "# pos = get_data_from_files('../hw4_lie_false/')\n", "# neg = get_data_from_files('../hw4_lie_true/')\n", "\n", "## TRUE IS NEG!!!!\n", "neg = get_data_from_files('../myle_pos_deceptive/')\n", "pos = get_data_from_files('../myle_pos_truthful/')\n", "neg2 = get_data_from_files('../myle_neg_deceptive/')\n", "pos2 = get_data_from_files('../myle_neg_truthful/')" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0I traveled to Chicago with my husband for a ro...N
1I stayed in the Sofitel Chicago Water Tower ho...N
2This hotel was gorgeous! I really enjoyed my s...N
3This is an absolutely exquisite hotel, at a gr...N
4I recently traveled up to Chicago for business...N
.........
310It's not a bad hotel. It's just so...disappoin...P
311My wife and I brought our daughter downtown fo...P
312Excellent Hotel ! Rooms and service were great...P
313Had a week long stay at the Hilton on south Mi...P
314We stayed at the James hotel for a 40th birthd...P
\n", "

315 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 PoN\n", "0 I traveled to Chicago with my husband for a ro... N\n", "1 I stayed in the Sofitel Chicago Water Tower ho... N\n", "2 This hotel was gorgeous! I really enjoyed my s... N\n", "3 This is an absolutely exquisite hotel, at a gr... N\n", "4 I recently traveled up to Chicago for business... N\n", ".. ... ..\n", "310 It's not a bad hotel. It's just so...disappoin... P\n", "311 My wife and I brought our daughter downtown fo... P\n", "312 Excellent Hotel ! Rooms and service were great... P\n", "313 Had a week long stay at the Hilton on south Mi... P\n", "314 We stayed at the James hotel for a 40th birthd... P\n", "\n", "[315 rows x 2 columns]" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "neg_df = pd.DataFrame(neg)\n", "pos_df = pd.DataFrame(pos)\n", "pos_df['PoN'] = 'P'\n", "neg_df['PoN'] = 'N'\n", "neg_df2 = pd.DataFrame(neg2)\n", "pos_df2 = pd.DataFrame(pos2)\n", "pos_df2['PoN'] = 'P'\n", "neg_df2['PoN'] = 'N'\n", "all_df = neg_df.append(pos_df)\n", "all_df2 = neg_df2.append(pos_df2)\n", "all_df = all_df.append(all_df2)\n", "all_df.reset_index(drop=True,inplace=True)\n", "# all_df2.reset_index(drop=True,inplace=True)\n", "all_df[:-5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: TOKENIZE" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2a by sentence" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "def get_sentence_tokens(review):\n", " return sent_tokenize(review)\n", " \n", "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n", "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2b by word" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n", "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 " ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 2c Remove if tokens < 1" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 " ] }, "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)\n", "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: EXPERIMENT\n", "#### Experiment with: stopwords, stemming, lemming etc." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3a remove english stopwords" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def remove_stopwords(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n", "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_sw
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "\n", " no_sw num_no_sw \n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 " ] }, "execution_count": 81, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3b get stems for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [], "source": [ "from nltk.stem import PorterStemmer\n", "def get_stems(sentence):\n", " ps = PorterStemmer()\n", " return [ps.stem(w) for w in sentence]\n", " \n", "all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)\n", "all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "\n", " stemmed_no_sw \n", "0 [travel, chicago, husband, romant, weekend, aw... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... " ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### -- 3c get lemmas for both tokens and no_sw" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [], "source": [ "from nltk.stem.wordnet import WordNetLemmatizer\n", "def get_lemmas(sentence):\n", " lem = WordNetLemmatizer() \n", " return [lem.lemmatize(w) for w in sentence]\n", " \n", "all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)\n", "all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_sw
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw...[i, traveled, to, chicago, with, my, husband, ...[traveled, chicago, husband, romantic, weekend...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ...[i, stayed, in, the, sofitel, chicago, water, ...[stayed, sofitel, chicago, water, tower, hotel...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,...[this, hotel, wa, gorgeous, i, really, enjoyed...[hotel, gorgeous, really, enjoyed, stay, defin...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "\n", " stemmed_no_sw \\\n", "0 [travel, chicago, husband, romant, weekend, aw... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... \n", "\n", " lemmed \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... \n", "2 [this, hotel, wa, gorgeous, i, really, enjoyed... \n", "\n", " lemmed_no_sw \n", "0 [traveled, chicago, husband, romantic, weekend... \n", "1 [stayed, sofitel, chicago, water, tower, hotel... \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... " ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:3]" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [ "all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)\n", "all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 87, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_sw
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw...[i, traveled, to, chicago, with, my, husband, ...[traveled, chicago, husband, romantic, weekend...[(i, NN), (traveled, VBD), (to, TO), (chicago,...[(traveled, VBN), (chicago, JJ), (husband, NN)...{'NN': 18, 'VBD': 6, 'TO': 1, 'VB': 3, 'IN': 6...{'VBN': 1, 'JJ': 6, 'NN': 16, 'RB': 4, 'MD': 2...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ...[i, stayed, in, the, sofitel, chicago, water, ...[stayed, sofitel, chicago, water, tower, hotel...[(i, JJ), (stayed, VBD), (in, IN), (the, DT), ...[(stayed, JJ), (sofitel, NN), (chicago, NN), (...{'JJ': 19, 'VBD': 6, 'IN': 16, 'DT': 14, 'NN':...{'JJ': 15, 'NN': 29, 'CD': 1, 'NNS': 11, 'RB':...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,...[this, hotel, wa, gorgeous, i, really, enjoyed...[hotel, gorgeous, really, enjoyed, stay, defin...[(this, DT), (hotel, NN), (was, VBD), (gorgeou...[(hotel, NN), (gorgeous, JJ), (really, RB), (e...{'DT': 9, 'NN': 15, 'VBD': 6, 'JJ': 10, 'RB': ...{'NN': 15, 'JJ': 9, 'RB': 6, 'VBN': 1, 'VBG': ...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "\n", " stemmed_no_sw \\\n", "0 [travel, chicago, husband, romant, weekend, aw... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... \n", "\n", " lemmed \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... \n", "2 [this, hotel, wa, gorgeous, i, really, enjoyed... \n", "\n", " lemmed_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... \n", "1 [stayed, sofitel, chicago, water, tower, hotel... \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... \n", "\n", " pos \\\n", "0 [(i, NN), (traveled, VBD), (to, TO), (chicago,... \n", "1 [(i, JJ), (stayed, VBD), (in, IN), (the, DT), ... \n", "2 [(this, DT), (hotel, NN), (was, VBD), (gorgeou... \n", "\n", " pos_no_sw \\\n", "0 [(traveled, VBN), (chicago, JJ), (husband, NN)... \n", "1 [(stayed, JJ), (sofitel, NN), (chicago, NN), (... \n", "2 [(hotel, NN), (gorgeous, JJ), (really, RB), (e... \n", "\n", " pos_dict \\\n", "0 {'NN': 18, 'VBD': 6, 'TO': 1, 'VB': 3, 'IN': 6... \n", "1 {'JJ': 19, 'VBD': 6, 'IN': 16, 'DT': 14, 'NN':... \n", "2 {'DT': 9, 'NN': 15, 'VBD': 6, 'JJ': 10, 'RB': ... \n", "\n", " pos_dict_no_sw \n", "0 {'VBN': 1, 'JJ': 6, 'NN': 16, 'RB': 4, 'MD': 2... \n", "1 {'JJ': 15, 'NN': 29, 'CD': 1, 'NNS': 11, 'RB':... \n", "2 {'NN': 15, 'JJ': 9, 'RB': 6, 'VBN': 1, 'VBG': ... " ] }, "execution_count": 87, "metadata": {}, "output_type": "execute_result" } ], "source": [ "def get_pos_dict(pos_tuple):\n", " pos_dict = {}\n", " for t in pos_tuple:\n", " if t[1] in pos_dict.keys():\n", " pos_dict[t[1]] += 1\n", " else:\n", " pos_dict.update({t[1]: 1})\n", " return pos_dict\n", "\n", "all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)\n", "all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)\n", "all_df[:3]" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_swlemmedlemmed_no_swpospos_no_swpos_dictpos_dict_no_swbowbow_no_sw
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw...[i, traveled, to, chicago, with, my, husband, ...[traveled, chicago, husband, romantic, weekend...[(i, NN), (traveled, VBD), (to, TO), (chicago,...[(traveled, VBN), (chicago, JJ), (husband, NN)...{'NN': 18, 'VBD': 6, 'TO': 1, 'VB': 3, 'IN': 6...{'VBN': 1, 'JJ': 6, 'NN': 16, 'RB': 4, 'MD': 2...{'i': 1, 'traveled': 1, 'to': 1, 'chicago': 2,...{'traveled': 1, 'chicago': 2, 'husband': 1, 'r...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ...[i, stayed, in, the, sofitel, chicago, water, ...[stayed, sofitel, chicago, water, tower, hotel...[(i, JJ), (stayed, VBD), (in, IN), (the, DT), ...[(stayed, JJ), (sofitel, NN), (chicago, NN), (...{'JJ': 19, 'VBD': 6, 'IN': 16, 'DT': 14, 'NN':...{'JJ': 15, 'NN': 29, 'CD': 1, 'NNS': 11, 'RB':...{'i': 3, 'stayed': 1, 'in': 1, 'the': 9, 'sofi...{'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,...[this, hotel, wa, gorgeous, i, really, enjoyed...[hotel, gorgeous, really, enjoyed, stay, defin...[(this, DT), (hotel, NN), (was, VBD), (gorgeou...[(hotel, NN), (gorgeous, JJ), (really, RB), (e...{'DT': 9, 'NN': 15, 'VBD': 6, 'JJ': 10, 'RB': ...{'NN': 15, 'JJ': 9, 'RB': 6, 'VBN': 1, 'VBG': ...{'this': 2, 'hotel': 2, 'was': 6, 'gorgeous': ...{'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "\n", " stemmed_no_sw \\\n", "0 [travel, chicago, husband, romant, weekend, aw... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... \n", "\n", " lemmed \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... \n", "2 [this, hotel, wa, gorgeous, i, really, enjoyed... \n", "\n", " lemmed_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... \n", "1 [stayed, sofitel, chicago, water, tower, hotel... \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... \n", "\n", " pos \\\n", "0 [(i, NN), (traveled, VBD), (to, TO), (chicago,... \n", "1 [(i, JJ), (stayed, VBD), (in, IN), (the, DT), ... \n", "2 [(this, DT), (hotel, NN), (was, VBD), (gorgeou... \n", "\n", " pos_no_sw \\\n", "0 [(traveled, VBN), (chicago, JJ), (husband, NN)... \n", "1 [(stayed, JJ), (sofitel, NN), (chicago, NN), (... \n", "2 [(hotel, NN), (gorgeous, JJ), (really, RB), (e... \n", "\n", " pos_dict \\\n", "0 {'NN': 18, 'VBD': 6, 'TO': 1, 'VB': 3, 'IN': 6... \n", "1 {'JJ': 19, 'VBD': 6, 'IN': 16, 'DT': 14, 'NN':... \n", "2 {'DT': 9, 'NN': 15, 'VBD': 6, 'JJ': 10, 'RB': ... \n", "\n", " pos_dict_no_sw \\\n", "0 {'VBN': 1, 'JJ': 6, 'NN': 16, 'RB': 4, 'MD': 2... \n", "1 {'JJ': 15, 'NN': 29, 'CD': 1, 'NNS': 11, 'RB':... \n", "2 {'NN': 15, 'JJ': 9, 'RB': 6, 'VBN': 1, 'VBG': ... \n", "\n", " bow \\\n", "0 {'i': 1, 'traveled': 1, 'to': 1, 'chicago': 2,... \n", "1 {'i': 3, 'stayed': 1, 'in': 1, 'the': 9, 'sofi... \n", "2 {'this': 2, 'hotel': 2, 'was': 6, 'gorgeous': ... \n", "\n", " bow_no_sw \n", "0 {'traveled': 1, 'chicago': 2, 'husband': 1, 'r... \n", "1 {'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat... \n", "2 {'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo... " ] }, "execution_count": 88, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# def get_bow_from_tokens(df, column):\n", "# all_column_data = ' '.join(df[column].tolist())\n", "# all_column_fd = Counter(all_column_data.split())\n", "# return all_column_fd\n", "\n", "# # bow = get_bow_from_column(all_df, 'diy_cleaner')\n", "# # bow =\n", "from collections import Counter\n", "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n", "all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n", "all_df[:3]" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "\n", "big_bow = [item for review in all_df['bow'].tolist() for item in review]\n", "big_bow_n = [item for review in all_df_n['bow'].tolist() for item in review]\n", "big_bow_p = [item for review in all_df_p['bow'].tolist() for item in review]\n", "\n", "df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()\n", "df = df.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()\n", "df_n = df_n.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()\n", "df_p = df_p.rename(columns={'index':'word', 0:'count'})" ] }, { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "import seaborn as sns\n", "import matplotlib.pyplot as plt \n", "def bar_plot(df, title): \n", " graph = sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n", " plt.title(title)\n", " plt.xlabel(\"Word\")\n", " plt.ylabel(\"Count\")\n", " sns.set_context(\"talk\")\n", " plt.xticks(rotation = 90)\n", " return plt\n", "\n", "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (ALL) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (TRUE) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (FALSE) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [], "source": [ "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "\n", "big_bow = [item for review in all_df['bow_no_sw'].tolist() for item in review]\n", "big_bow_n = [item for review in all_df_n['bow_no_sw'].tolist() for item in review]\n", "big_bow_p = [item for review in all_df_p['bow_no_sw'].tolist() for item in review]\n", "\n", "df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()\n", "df = df.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()\n", "df_n = df_n.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()\n", "df_p = df_p.rename(columns={'index':'word', 0:'count'})" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (ALL) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (TRUE) Stopwords Removed\"))" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:20], \"Top 20 Items (FALSE) Stopwords Removed\"))" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [], "source": [ "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "\n", "big_bow = [item for review in all_df['pos_dict'].tolist() for item in review]\n", "big_bow_n = [item for review in all_df_n['pos_dict'].tolist() for item in review]\n", "big_bow_p = [item for review in all_df_p['pos_dict'].tolist() for item in review]\n", "\n", "df = pd.DataFrame.from_dict(Counter(big_bow), orient='index').reset_index()\n", "df = df.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_n = pd.DataFrame.from_dict(Counter(big_bow_n), orient='index').reset_index()\n", "df_n = df_n.rename(columns={'index':'word', 0:'count'})\n", "\n", "df_p = pd.DataFrame.from_dict(Counter(big_bow_p), orient='index').reset_index()\n", "df_p = df_p.rename(columns={'index':'word', 0:'count'})" ] }, { "cell_type": "code", "execution_count": 98, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 Items (ALL) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 99, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS (TRUE) Prior to Cleaning\"))" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS (FALSE) Prior to Cleaning\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: TEST EXPERIMENTS!!" ] }, { "cell_type": "code", "execution_count": 229, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn import metrics\n", "from sklearn.metrics import confusion_matrix, classification_report\n", " \n", "# def cross_validation():\n", " \n", "def get_NB(small_df, labels):\n", " \n", " seeds = [109, 210, 420, 19, 7]\n", " for seed in seeds:\n", " x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = seed)\n", "\n", " gnb = GaussianNB()\n", " gnb.fit(x_train, y_train)\n", " y_pred = gnb.predict(x_test)\n", " print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))\n", "# print(\"The accuracy is\", accuracy)\n", " cm = confusion_matrix(y_test, y_pred)\n", " # confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\n", " tn, fp, fn, tp = cm.ravel()\n", " print(tn, fp, fn, tp)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1: Parts of speech frequency distribution" ] }, { "cell_type": "code", "execution_count": 230, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNVBDTOVBINPRP$DTJJRBMD...WDTPDTJJRWPJJSEXRBSNNPUHFW
PoN
N186.01.03.063.07.054.02.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
N286.02.03.0162.014.0194.01.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
N156.01.02.031.09.0108.02.0...NaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", "

3 rows × 31 columns

\n", "
" ], "text/plain": [ " NN VBD TO VB IN PRP$ DT JJ RB MD ... WDT PDT JJR WP \\\n", "PoN ... \n", "N 18 6.0 1.0 3.0 6 3.0 7.0 5 4.0 2.0 ... NaN NaN NaN NaN \n", "N 28 6.0 2.0 3.0 16 2.0 14.0 19 4.0 1.0 ... NaN NaN NaN NaN \n", "N 15 6.0 1.0 2.0 3 1.0 9.0 10 8.0 2.0 ... NaN NaN NaN NaN \n", "\n", " JJS EX RBS NNP UH FW \n", "PoN \n", "N NaN NaN NaN NaN NaN NaN \n", "N NaN NaN NaN NaN NaN NaN \n", "N NaN NaN NaN NaN NaN NaN \n", "\n", "[3 rows x 31 columns]" ] }, "execution_count": 230, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])\n", "pos_df[:3]" ] }, { "cell_type": "code", "execution_count": 231, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNVBDTOVBINPRP$DTJJRBMD...WDTPDTJJRWPJJSEXRBSNNPUHFW
PoN
N18613637542...0000000000
N28623162141941...0000000000
N156123191082...0000000000
\n", "

3 rows × 31 columns

\n", "
" ], "text/plain": [ " NN VBD TO VB IN PRP$ DT JJ RB MD ... WDT PDT JJR WP JJS \\\n", "PoN ... \n", "N 18 6 1 3 6 3 7 5 4 2 ... 0 0 0 0 0 \n", "N 28 6 2 3 16 2 14 19 4 1 ... 0 0 0 0 0 \n", "N 15 6 1 2 3 1 9 10 8 2 ... 0 0 0 0 0 \n", "\n", " EX RBS NNP UH FW \n", "PoN \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "N 0 0 0 0 0 \n", "\n", "[3 rows x 31 columns]" ] }, "execution_count": 231, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df = pos_df.fillna(0).astype(int)\n", "pos_df[:3]" ] }, { "cell_type": "code", "execution_count": 232, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5\n", "47 3 45 1\n", "Accuracy: 0.4895833333333333\n", "45 1 48 2\n", "Accuracy: 0.4791666666666667\n", "42 1 49 4\n", "Accuracy: 0.53125\n", "47 0 45 4\n", "Accuracy: 0.40625\n", "35 3 54 4\n" ] } ], "source": [ "get_NB(pos_df, pos_df.index)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1b: Normalized parts of speech frequency distribution" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNVBDTOVBINPRP$DTJJRBMD...PDTJJRWPJJSEXRBSNNPUHFWtotal
PoN
N18613637542...00000000068
N28623162141941...000000000129
N156123191082...00000000069
\n", "

3 rows × 32 columns

\n", "
" ], "text/plain": [ " NN VBD TO VB IN PRP$ DT JJ RB MD ... PDT JJR WP JJS EX \\\n", "PoN ... \n", "N 18 6 1 3 6 3 7 5 4 2 ... 0 0 0 0 0 \n", "N 28 6 2 3 16 2 14 19 4 1 ... 0 0 0 0 0 \n", "N 15 6 1 2 3 1 9 10 8 2 ... 0 0 0 0 0 \n", "\n", " RBS NNP UH FW total \n", "PoN \n", "N 0 0 0 0 68 \n", "N 0 0 0 0 129 \n", "N 0 0 0 0 69 \n", "\n", "[3 rows x 32 columns]" ] }, "execution_count": 146, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df_norm = pos_df.copy()\n", "pos_df_norm = pos_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "pos_df_norm[:3]\n", "pos_df_norm[1:]\n", "test = pos_df.copy()\n", "test['total'] = test.sum(axis = 1)\n", "test[:3]" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NNVBDTOVBINPRP$DTJJRBMD...WDTPDTJJRWPJJSEXRBSNNPUHFW
PoN
N0.2647060.0882350.0147060.0441180.0882350.0441180.1029410.0735290.0588240.029412...0.00.00.00.00.00.00.00.00.00.0
N0.2170540.0465120.0155040.0232560.1240310.0155040.1085270.1472870.0310080.007752...0.00.00.00.00.00.00.00.00.00.0
N0.2173910.0869570.0144930.0289860.0434780.0144930.1304350.1449280.1159420.028986...0.00.00.00.00.00.00.00.00.00.0
\n", "

3 rows × 31 columns

\n", "
" ], "text/plain": [ " NN VBD TO VB IN PRP$ DT \\\n", "PoN \n", "N 0.264706 0.088235 0.014706 0.044118 0.088235 0.044118 0.102941 \n", "N 0.217054 0.046512 0.015504 0.023256 0.124031 0.015504 0.108527 \n", "N 0.217391 0.086957 0.014493 0.028986 0.043478 0.014493 0.130435 \n", "\n", " JJ RB MD ... WDT PDT JJR WP JJS EX RBS \\\n", "PoN ... \n", "N 0.073529 0.058824 0.029412 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.147287 0.031008 0.007752 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.144928 0.115942 0.028986 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " NNP UH FW \n", "PoN \n", "N 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 \n", "\n", "[3 rows x 31 columns]" ] }, "execution_count": 147, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pos_df_norm[:3]" ] }, { "cell_type": "code", "execution_count": 190, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5104166666666666\n", "Accuracy: 0.5\n", "Accuracy: 0.4791666666666667\n", "Accuracy: 0.5208333333333334\n", "Accuracy: 0.3854166666666667\n" ] } ], "source": [ "get_NB(pos_df_norm, pos_df.index)" ] }, { "cell_type": "code", "execution_count": 191, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5625\n", "Accuracy: 0.65625\n", "Accuracy: 0.5625\n", "Accuracy: 0.5833333333333334\n", "Accuracy: 0.5416666666666666\n" ] } ], "source": [ "# small_df\n", "small_df = pos_df_norm.filter(['PRP', 'PRP$','NN'])\n", "get_NB(small_df, pos_df.index)" ] }, { "cell_type": "code", "execution_count": 192, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1968\n", "959\n", "1009\n", "0.4872967479674797\n", "0.5127032520325203\n" ] } ], "source": [ "pos_df_n = pos_df[pos_df.index == 'N']\n", "pos_df_p = pos_df[pos_df.index == 'P']\n", "print(pos_df['PRP'].sum())\n", "print(pos_df_n['PRP'].sum())\n", "print(pos_df_p['PRP'].sum())\n", "print(pos_df_n['PRP'].sum()/pos_df['PRP'].sum())\n", "print(pos_df_p['PRP'].sum()/pos_df['PRP'].sum())" ] }, { "cell_type": "code", "execution_count": 193, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "888\n", "518\n", "370\n" ] } ], "source": [ "pos_df_n = pos_df[pos_df.index == 'N']\n", "pos_df_p = pos_df[pos_df.index == 'P']\n", "print(pos_df['PRP$'].sum())\n", "print(pos_df_n['PRP$'].sum())\n", "print(pos_df_p['PRP$'].sum())" ] }, { "cell_type": "code", "execution_count": 194, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "13.405406541226437\n", "6.357665198600147\n", "7.04774134262629\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP'].sum())\n", "print(pos_df_n['PRP'].sum())\n", "print(pos_df_p['PRP'].sum())" ] }, { "cell_type": "code", "execution_count": 195, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.041891895441332615\n", "0.039735407491250915\n", "0.044048383391414314\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP'].mean())\n", "print(pos_df_n['PRP'].mean())\n", "print(pos_df_p['PRP'].mean())" ] }, { "cell_type": "code", "execution_count": 196, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0.019007323131444483\n", "0.022148272795477524\n", "0.015866373467411442\n" ] } ], "source": [ "pos_df_n = pos_df_norm[pos_df_norm.index == 'N']\n", "pos_df_p = pos_df_norm[pos_df_norm.index == 'P']\n", "print(pos_df_norm['PRP$'].mean())\n", "print(pos_df_n['PRP$'].mean())\n", "print(pos_df_p['PRP$'].mean())" ] }, { "cell_type": "code", "execution_count": 197, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 [(i, NN), (traveled, VBD), (to, TO), (chicago,...\n", "1 [(i, JJ), (stayed, VBD), (in, IN), (the, DT), ...\n", "2 [(this, DT), (hotel, NN), (was, VBD), (gorgeou...\n", "3 [(this, DT), (is, VBZ), (an, DT), (absolutely,...\n", "4 [(i, NN), (recently, RB), (traveled, VBD), (up...\n", " ... \n", "315 [(this, DT), (hotel, NN), (was, VBD), (not, RB...\n", "316 [(i, JJ), (stayed, VBD), (at, IN), (the, DT), ...\n", "317 [(we, PRP), (had, VBD), (a, DT), (reservation,...\n", "318 [(i, NN), (am, VBP), (staying, VBG), (here, RB...\n", "319 [(we, PRP), (enjoyed, VBD), (the, DT), (hotel,...\n", "Name: pos, Length: 320, dtype: object" ] }, "execution_count": 197, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['pos']" ] }, { "cell_type": "code", "execution_count": 198, "metadata": {}, "outputs": [], "source": [ "all_df['pos_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos']], axis=1)\n", "all_df['pos_sent_str'] = all_df.apply(lambda x: [' '.join(x['pos_sent'])], axis=1)\n", "all_df['pos_no_sw_sent'] = all_df.apply(lambda x: [word[1] for word in x['pos_no_sw']], axis=1)" ] }, { "cell_type": "code", "execution_count": 199, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "list" ] }, "execution_count": 199, "metadata": {}, "output_type": "execute_result" } ], "source": [ "type(all_df['pos_sent_str'][1])" ] }, { "cell_type": "code", "execution_count": 200, "metadata": {}, "outputs": [], "source": [ "all_df['pos_sent_bi'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])], axis=1)\n", "# bigrams = [b for l in text for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])]" ] }, { "cell_type": "code", "execution_count": 201, "metadata": {}, "outputs": [], "source": [ "# all_df['pos_sent_tri'] = all_df.apply(lambda x: [b for l in x['pos_sent_str'] for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])], axis=1)\n" ] }, { "cell_type": "code", "execution_count": 202, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw...bow_no_swpos_sentpos_sent_strpos_no_sw_sentpos_sent_bibow_postrigramstrigrams_postrigrams_featstrigrams_feats_bow
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw......{'traveled': 1, 'chicago': 2, 'husband': 1, 'r...[NN, VBD, TO, VB, IN, PRP$, NN, IN, DT, JJ, NN...[NN VBD TO VB IN PRP$ NN IN DT JJ NN RB PRP$ N...[VBN, JJ, NN, JJ, NN, RB, JJ, JJ, NN, NN, NN, ...[(NN, VBD), (VBD, TO), (TO, VB), (VB, IN), (IN...{('NN', 'VBD'): 4, ('VBD', 'TO'): 1, ('TO', 'V...[(i, traveled, to), (traveled, to, chicago), (...[(NN, VBD, TO), (VBD, TO, VB), (TO, VB, IN), (...[NN_VBD_TO, VBD_TO_VB, TO_VB_IN, VB_IN_PRP, IN...{'NN_VBD_TO': 1, 'VBD_TO_VB': 1, 'TO_VB_IN': 1...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ......{'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat...[JJ, VBD, IN, DT, NN, NN, NN, NN, NN, IN, PRP$...[JJ VBD IN DT NN NN NN NN NN IN PRP$ NN CC CD ...[JJ, NN, NN, NN, NN, NN, NN, CD, NNS, JJ, NN, ...[(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN...{('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D...[(i, stayed, in), (stayed, in, the), (in, the,...[(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (...[JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_NN, NN_...{'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 2...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,......{'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo...[DT, NN, VBD, JJ, JJ, RB, VBN, PRP$, NN, RB, C...[DT NN VBD JJ JJ RB VBN PRP$ NN RB CC MD RB VB...[NN, JJ, RB, VBN, NN, RB, VBG, JJ, NN, JJ, NN,...[(DT, NN), (NN, VBD), (VBD, JJ), (JJ, JJ), (JJ...{('DT', 'NN'): 8, ('NN', 'VBD'): 6, ('VBD', 'J...[(this, hotel, was), (hotel, was, gorgeous), (...[(DT, NN, VBD), (NN, VBD, JJ), (VBD, JJ, JJ), ...[DT_NN_VBD, NN_VBD_JJ, VBD_JJ_JJ, JJ_JJ_RB, JJ...{'DT_NN_VBD': 4, 'NN_VBD_JJ': 4, 'VBD_JJ_JJ': ...
3This is an absolutely exquisite hotel, at a gr...N[This is an absolutely exquisite hotel, at a g...6[this, is, an, absolutely, exquisite, hotel, a...110[absolutely, exquisite, hotel, great, location...52[thi, is, an, absolut, exquisit, hotel, at, a,...[absolut, exquisit, hotel, great, locat, boast......{'absolutely': 1, 'exquisite': 1, 'hotel': 3, ...[DT, VBZ, DT, RB, JJ, NN, IN, DT, JJ, NN, CC, ...[DT VBZ DT RB JJ NN IN DT JJ NN CC NN NN NNS N...[RB, JJ, NN, JJ, NN, VBG, NN, NNS, JJ, NN, RB,...[(DT, VBZ), (VBZ, DT), (DT, RB), (RB, JJ), (JJ...{('DT', 'VBZ'): 1, ('VBZ', 'DT'): 2, ('DT', 'R...[(this, is, an), (is, an, absolutely), (an, ab...[(DT, VBZ, DT), (VBZ, DT, RB), (DT, RB, JJ), (...[DT_VBZ_DT, VBZ_DT_RB, DT_RB_JJ, RB_JJ_NN, JJ_...{'DT_VBZ_DT': 1, 'VBZ_DT_RB': 1, 'DT_RB_JJ': 1...
\n", "

4 rows × 27 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "3 This is an absolutely exquisite hotel, at a gr... N \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "3 [This is an absolutely exquisite hotel, at a g... 6 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "3 [this, is, an, absolutely, exquisite, hotel, a... 110 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "3 [absolutely, exquisite, hotel, great, location... 52 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "3 [thi, is, an, absolut, exquisit, hotel, at, a,... \n", "\n", " stemmed_no_sw ... \\\n", "0 [travel, chicago, husband, romant, weekend, aw... ... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... ... \n", "3 [absolut, exquisit, hotel, great, locat, boast... ... \n", "\n", " bow_no_sw \\\n", "0 {'traveled': 1, 'chicago': 2, 'husband': 1, 'r... \n", "1 {'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat... \n", "2 {'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo... \n", "3 {'absolutely': 1, 'exquisite': 1, 'hotel': 3, ... \n", "\n", " pos_sent \\\n", "0 [NN, VBD, TO, VB, IN, PRP$, NN, IN, DT, JJ, NN... \n", "1 [JJ, VBD, IN, DT, NN, NN, NN, NN, NN, IN, PRP$... \n", "2 [DT, NN, VBD, JJ, JJ, RB, VBN, PRP$, NN, RB, C... \n", "3 [DT, VBZ, DT, RB, JJ, NN, IN, DT, JJ, NN, CC, ... \n", "\n", " pos_sent_str \\\n", "0 [NN VBD TO VB IN PRP$ NN IN DT JJ NN RB PRP$ N... \n", "1 [JJ VBD IN DT NN NN NN NN NN IN PRP$ NN CC CD ... \n", "2 [DT NN VBD JJ JJ RB VBN PRP$ NN RB CC MD RB VB... \n", "3 [DT VBZ DT RB JJ NN IN DT JJ NN CC NN NN NNS N... \n", "\n", " pos_no_sw_sent \\\n", "0 [VBN, JJ, NN, JJ, NN, RB, JJ, JJ, NN, NN, NN, ... \n", "1 [JJ, NN, NN, NN, NN, NN, NN, CD, NNS, JJ, NN, ... \n", "2 [NN, JJ, RB, VBN, NN, RB, VBG, JJ, NN, JJ, NN,... \n", "3 [RB, JJ, NN, JJ, NN, VBG, NN, NNS, JJ, NN, RB,... \n", "\n", " pos_sent_bi \\\n", "0 [(NN, VBD), (VBD, TO), (TO, VB), (VB, IN), (IN... \n", "1 [(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN... \n", "2 [(DT, NN), (NN, VBD), (VBD, JJ), (JJ, JJ), (JJ... \n", "3 [(DT, VBZ), (VBZ, DT), (DT, RB), (RB, JJ), (JJ... \n", "\n", " bow_pos \\\n", "0 {('NN', 'VBD'): 4, ('VBD', 'TO'): 1, ('TO', 'V... \n", "1 {('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D... \n", "2 {('DT', 'NN'): 8, ('NN', 'VBD'): 6, ('VBD', 'J... \n", "3 {('DT', 'VBZ'): 1, ('VBZ', 'DT'): 2, ('DT', 'R... \n", "\n", " trigrams \\\n", "0 [(i, traveled, to), (traveled, to, chicago), (... \n", "1 [(i, stayed, in), (stayed, in, the), (in, the,... \n", "2 [(this, hotel, was), (hotel, was, gorgeous), (... \n", "3 [(this, is, an), (is, an, absolutely), (an, ab... \n", "\n", " trigrams_pos \\\n", "0 [(NN, VBD, TO), (VBD, TO, VB), (TO, VB, IN), (... \n", "1 [(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (... \n", "2 [(DT, NN, VBD), (NN, VBD, JJ), (VBD, JJ, JJ), ... \n", "3 [(DT, VBZ, DT), (VBZ, DT, RB), (DT, RB, JJ), (... \n", "\n", " trigrams_feats \\\n", "0 [NN_VBD_TO, VBD_TO_VB, TO_VB_IN, VB_IN_PRP, IN... \n", "1 [JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_NN, NN_... \n", "2 [DT_NN_VBD, NN_VBD_JJ, VBD_JJ_JJ, JJ_JJ_RB, JJ... \n", "3 [DT_VBZ_DT, VBZ_DT_RB, DT_RB_JJ, RB_JJ_NN, JJ_... \n", "\n", " trigrams_feats_bow \n", "0 {'NN_VBD_TO': 1, 'VBD_TO_VB': 1, 'TO_VB_IN': 1... \n", "1 {'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 2... \n", "2 {'DT_NN_VBD': 4, 'NN_VBD_JJ': 4, 'VBD_JJ_JJ': ... \n", "3 {'DT_VBZ_DT': 1, 'VBZ_DT_RB': 1, 'DT_RB_JJ': 1... \n", "\n", "[4 rows x 27 columns]" ] }, "execution_count": 202, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:4]" ] }, { "cell_type": "code", "execution_count": 203, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['JJ VBD IN DT NN NN NN NN NN IN PRP$ NN CC CD NNS JJ NN CC MD RB VB DT NN RB IN JJ NNS DT JJ NNS IN JJ JJ NN CC JJ NNS VBD DT JJ NN TO DT JJ JJ NNS CC NNS IN NN VBP VBN IN JJ NN NNS DT NNS VBD JJ NN CC IN NNS CC JJ NNS JJ IN JJ NN NN CC VB DT NN VBD JJ RB VBN CC VBN IN DT NNS DT NN NN VBD VBG CC DT NN IN PRP$ NN VBD JJ JJ VBN DT NN IN DT NN NN IN DT JJ NN IN TO DT NN IN VBG NN NN CC VBG RP NNS IN VBG PRP VB RB IN NN']" ] }, "execution_count": 203, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test = all_df['pos_sent_str'][1]\n", "test" ] }, { "cell_type": "code", "execution_count": 204, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[('NN', 'NNS'), ('NNS', 'VBP'), ('VBP', 'JJ'), ('JJ', 'JJ'), ('JJ', 'NN'), ('NN', 'NN'), ('NN', 'CC'), ('CC', 'JJ'), ('JJ', 'NN'), ('NN', 'JJ'), ('JJ', 'VBZ'), ('VBZ', 'DT'), ('DT', 'NN'), ('NN', 'VBZ'), ('VBZ', 'RB'), ('RB', 'JJ'), ('JJ', 'RB'), ('RB', 'CC'), ('CC', 'JJ'), ('JJ', 'VB'), ('VB', 'JJ'), ('JJ', 'TO'), ('TO', 'VB'), ('VB', 'DT'), ('DT', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'IN'), ('IN', 'RB'), ('RB', 'RB'), ('RB', 'IN'), ('IN', 'PRP'), ('PRP', 'VBP'), ('VBP', 'VBN'), ('VBN', 'IN'), ('IN', 'DT'), ('DT', 'JJ'), ('JJ', 'NNS'), ('NNS', 'DT'), ('DT', 'NN'), ('NN', 'VBP'), ('VBP', 'JJ'), ('JJ', 'IN'), ('IN', 'PRP$'), ('PRP$', 'NNS'), ('NNS', 'CC'), ('CC', 'NN'), ('NN', 'TO'), ('TO', 'VB'), ('VB', 'NN'), ('NN', 'NN')]\n" ] } ], "source": [ "text = [\"this is a sentence\", \"so is this one\"]\n", "test2 = [\"NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN\", \"PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN\"]\n", "test1 = ['NN NNS VBP JJ JJ NN NN CC JJ NN JJ VBZ DT NN VBZ RB JJ RB CC JJ VB JJ TO VB DT NN TO VB NN IN RB RB IN PRP VBP VBN IN DT JJ NNS DT NN VBP JJ IN PRP$ NNS CC NN TO VB NN NN']\n", "bigrams = [b for l in test1 for b in zip(l.split(\" \")[:-1], l.split(\" \")[1:])]\n", "print(bigrams)" ] }, { "cell_type": "code", "execution_count": 205, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[('NN', 'VBD'), ('VBD', 'TO'), ('TO', 'VB'), ('VB', 'IN'), ('IN', 'PRP$')]" ] }, "execution_count": 205, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# all_bigrams = [bigram for bigram in all_df.pos_sent_bi.tolist()]\n", "# flat_list = [item for sublist in l for item in sublist]\n", "all_df_n = all_df[all_df['PoN'] == 'N']\n", "all_df_p = all_df[all_df['PoN'] == 'P']\n", "all_bigrams = [bigram for sublist in all_df.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams_n = [bigram for sublist in all_df_n.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams_p = [bigram for sublist in all_df_p.pos_sent_bi.tolist() for bigram in sublist]\n", "all_bigrams[:5]" ] }, { "cell_type": "code", "execution_count": 206, "metadata": {}, "outputs": [], "source": [ "count = Counter(all_bigrams)\n", "count_n = Counter(all_bigrams_n)\n", "count_p = Counter(all_bigrams_p)" ] }, { "cell_type": "code", "execution_count": 207, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['TO', 'VB'], dtype='\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (ALL)\"))" ] }, { "cell_type": "code", "execution_count": 210, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_n.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (TRUE)\"))" ] }, { "cell_type": "code", "execution_count": 211, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n" ] }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "print(bar_plot(df_p.sort_values(by=[\"count\"], ascending=False)[:10], \"Top 10 POS Bigrams (FALSE)\"))" ] }, { "cell_type": "code", "execution_count": 212, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
(NN, VBD)(VBD, TO)(TO, VB)(VB, IN)(IN, PRP$)(PRP$, NN)(NN, IN)(IN, DT)(DT, JJ)(JJ, NN)...(RBS, PRP)(PRP, JJR)(JJ, RBS)(RBS, RB)(PRP$, TO)(WRB, VBD)(CC, JJS)(MD, DT)(VBN, MD)(PRP, RBR)
PoN
N4111133211...0000000000
N3000226447...0000000000
N6000010211...0000000000
N3120225436...0000000000
N120843561048...0000000000
\n", "

5 rows × 553 columns

\n", "
" ], "text/plain": [ " (NN, VBD) (VBD, TO) (TO, VB) (VB, IN) (IN, PRP$) (PRP$, NN) \\\n", "PoN \n", "N 4 1 1 1 1 3 \n", "N 3 0 0 0 2 2 \n", "N 6 0 0 0 0 1 \n", "N 3 1 2 0 2 2 \n", "N 12 0 8 4 3 5 \n", "\n", " (NN, IN) (IN, DT) (DT, JJ) (JJ, NN) ... (RBS, PRP) (PRP, JJR) \\\n", "PoN ... \n", "N 3 2 1 1 ... 0 0 \n", "N 6 4 4 7 ... 0 0 \n", "N 0 2 1 1 ... 0 0 \n", "N 5 4 3 6 ... 0 0 \n", "N 6 10 4 8 ... 0 0 \n", "\n", " (JJ, RBS) (RBS, RB) (PRP$, TO) (WRB, VBD) (CC, JJS) (MD, DT) \\\n", "PoN \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "\n", " (VBN, MD) (PRP, RBR) \n", "PoN \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "N 0 0 \n", "\n", "[5 rows x 553 columns]" ] }, "execution_count": 212, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['bow_pos'] = all_df.apply(lambda x: Counter(x['pos_sent_bi']), axis=1)\n", "\n", "new_df = pd.DataFrame(all_df['bow_pos'].tolist(), all_df['PoN'])\n", "new_df = new_df.fillna(0).astype(int)\n", "new_df[:5]" ] }, { "cell_type": "code", "execution_count": 213, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5416666666666666\n", "Accuracy: 0.5208333333333334\n", "Accuracy: 0.4791666666666667\n", "Accuracy: 0.5208333333333334\n", "Accuracy: 0.5833333333333334\n" ] } ], "source": [ "get_NB(new_df, new_df.index)" ] }, { "cell_type": "code", "execution_count": 214, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5625\n", "Accuracy: 0.5625\n", "Accuracy: 0.46875\n", "Accuracy: 0.4895833333333333\n", "Accuracy: 0.59375\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
(NN, VBD)(VBD, TO)(TO, VB)(VB, IN)(IN, PRP$)(PRP$, NN)(NN, IN)(IN, DT)(DT, JJ)(JJ, NN)...(RBS, PRP)(PRP, JJR)(JJ, RBS)(RBS, RB)(PRP$, TO)(WRB, VBD)(CC, JJS)(MD, DT)(VBN, MD)(PRP, RBR)
PoN
N0.0597010.0149250.0149250.0149250.0149250.0447760.0447760.0298510.0149250.014925...0.00.00.00.00.00.00.00.00.00.000000
N0.0234380.0000000.0000000.0000000.0156250.0156250.0468750.0312500.0312500.054688...0.00.00.00.00.00.00.00.00.00.000000
N0.0882350.0000000.0000000.0000000.0000000.0147060.0000000.0294120.0147060.014706...0.00.00.00.00.00.00.00.00.00.000000
N0.0275230.0091740.0183490.0000000.0183490.0183490.0458720.0366970.0275230.055046...0.00.00.00.00.00.00.00.00.00.000000
N0.0468750.0000000.0312500.0156250.0117190.0195310.0234380.0390620.0156250.031250...0.00.00.00.00.00.00.00.00.00.000000
..................................................................
P0.0655740.0000000.0000000.0000000.0000000.0000000.0327870.0655740.0163930.000000...0.00.00.00.00.00.00.00.00.00.000000
P0.0137930.0000000.0068970.0000000.0137930.0068970.0758620.0620690.0413790.048276...0.00.00.00.00.00.00.00.00.00.000000
P0.0000000.0000000.0076340.0000000.0076340.0076340.0610690.0610690.0229010.053435...0.00.00.00.00.00.00.00.00.00.007634
P0.0322580.0000000.0129030.0000000.0064520.0064520.0645160.0516130.0322580.025806...0.00.00.00.00.00.00.00.00.00.000000
P0.0294120.0000000.0294120.0000000.0000000.0000000.0294120.0000000.0294120.058824...0.00.00.00.00.00.00.00.00.00.000000
\n", "

320 rows × 553 columns

\n", "
" ], "text/plain": [ " (NN, VBD) (VBD, TO) (TO, VB) (VB, IN) (IN, PRP$) (PRP$, NN) \\\n", "PoN \n", "N 0.059701 0.014925 0.014925 0.014925 0.014925 0.044776 \n", "N 0.023438 0.000000 0.000000 0.000000 0.015625 0.015625 \n", "N 0.088235 0.000000 0.000000 0.000000 0.000000 0.014706 \n", "N 0.027523 0.009174 0.018349 0.000000 0.018349 0.018349 \n", "N 0.046875 0.000000 0.031250 0.015625 0.011719 0.019531 \n", ".. ... ... ... ... ... ... \n", "P 0.065574 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.013793 0.000000 0.006897 0.000000 0.013793 0.006897 \n", "P 0.000000 0.000000 0.007634 0.000000 0.007634 0.007634 \n", "P 0.032258 0.000000 0.012903 0.000000 0.006452 0.006452 \n", "P 0.029412 0.000000 0.029412 0.000000 0.000000 0.000000 \n", "\n", " (NN, IN) (IN, DT) (DT, JJ) (JJ, NN) ... (RBS, PRP) (PRP, JJR) \\\n", "PoN ... \n", "N 0.044776 0.029851 0.014925 0.014925 ... 0.0 0.0 \n", "N 0.046875 0.031250 0.031250 0.054688 ... 0.0 0.0 \n", "N 0.000000 0.029412 0.014706 0.014706 ... 0.0 0.0 \n", "N 0.045872 0.036697 0.027523 0.055046 ... 0.0 0.0 \n", "N 0.023438 0.039062 0.015625 0.031250 ... 0.0 0.0 \n", ".. ... ... ... ... ... ... ... \n", "P 0.032787 0.065574 0.016393 0.000000 ... 0.0 0.0 \n", "P 0.075862 0.062069 0.041379 0.048276 ... 0.0 0.0 \n", "P 0.061069 0.061069 0.022901 0.053435 ... 0.0 0.0 \n", "P 0.064516 0.051613 0.032258 0.025806 ... 0.0 0.0 \n", "P 0.029412 0.000000 0.029412 0.058824 ... 0.0 0.0 \n", "\n", " (JJ, RBS) (RBS, RB) (PRP$, TO) (WRB, VBD) (CC, JJS) (MD, DT) \\\n", "PoN \n", "N 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 0.0 0.0 \n", "N 0.0 0.0 0.0 0.0 0.0 0.0 \n", ".. ... ... ... ... ... ... \n", "P 0.0 0.0 0.0 0.0 0.0 0.0 \n", "P 0.0 0.0 0.0 0.0 0.0 0.0 \n", "P 0.0 0.0 0.0 0.0 0.0 0.0 \n", "P 0.0 0.0 0.0 0.0 0.0 0.0 \n", "P 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " (VBN, MD) (PRP, RBR) \n", "PoN \n", "N 0.0 0.000000 \n", "N 0.0 0.000000 \n", "N 0.0 0.000000 \n", "N 0.0 0.000000 \n", "N 0.0 0.000000 \n", ".. ... ... \n", "P 0.0 0.000000 \n", "P 0.0 0.000000 \n", "P 0.0 0.007634 \n", "P 0.0 0.000000 \n", "P 0.0 0.000000 \n", "\n", "[320 rows x 553 columns]" ] }, "execution_count": 214, "metadata": {}, "output_type": "execute_result" } ], "source": [ "bi_df_norm = new_df.copy()\n", "bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "# bi_df_norm = bi_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "bi_df_norm\n", "\n", "get_NB(bi_df_norm, bi_df_norm.index)\n", "bi_df_norm" ] }, { "cell_type": "code", "execution_count": 215, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
1(DT, NN)1719
14(IN, DT)1051
3(NN, IN)916
2(NN, NN)897
8(JJ, NN)859
6(NN, VBD)676
7(DT, JJ)540
30(PRP, VBD)458
21(NN, CC)451
9(IN, NN)398
25(TO, VB)374
0(VBD, DT)332
32(RB, JJ)308
147(VBD, RB)281
5(PRP$, NN)251
17(VBD, JJ)250
13(NNS, IN)233
158(NN, RB)229
36(IN, PRP)229
38(NN, DT)214
\n", "
" ], "text/plain": [ " word count\n", "1 (DT, NN) 1719\n", "14 (IN, DT) 1051\n", "3 (NN, IN) 916\n", "2 (NN, NN) 897\n", "8 (JJ, NN) 859\n", "6 (NN, VBD) 676\n", "7 (DT, JJ) 540\n", "30 (PRP, VBD) 458\n", "21 (NN, CC) 451\n", "9 (IN, NN) 398\n", "25 (TO, VB) 374\n", "0 (VBD, DT) 332\n", "32 (RB, JJ) 308\n", "147 (VBD, RB) 281\n", "5 (PRP$, NN) 251\n", "17 (VBD, JJ) 250\n", "13 (NNS, IN) 233\n", "158 (NN, RB) 229\n", "36 (IN, PRP) 229\n", "38 (NN, DT) 214" ] }, "execution_count": 215, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_p.sort_values(by=[\"count\"], ascending=False)[:20]" ] }, { "cell_type": "code", "execution_count": 216, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
wordcount
12(DT, NN)1519
7(IN, DT)1045
6(NN, IN)877
13(NN, NN)836
9(JJ, NN)782
0(NN, VBD)743
8(DT, JJ)529
27(PRP, VBD)438
2(TO, VB)435
49(NN, CC)423
61(IN, NN)393
5(PRP$, NN)377
28(VBD, RB)337
57(VBD, DT)292
90(RB, JJ)284
10(NN, RB)241
41(DT, NNS)240
4(IN, PRP$)226
122(IN, PRP)213
43(VBD, IN)209
\n", "
" ], "text/plain": [ " word count\n", "12 (DT, NN) 1519\n", "7 (IN, DT) 1045\n", "6 (NN, IN) 877\n", "13 (NN, NN) 836\n", "9 (JJ, NN) 782\n", "0 (NN, VBD) 743\n", "8 (DT, JJ) 529\n", "27 (PRP, VBD) 438\n", "2 (TO, VB) 435\n", "49 (NN, CC) 423\n", "61 (IN, NN) 393\n", "5 (PRP$, NN) 377\n", "28 (VBD, RB) 337\n", "57 (VBD, DT) 292\n", "90 (RB, JJ) 284\n", "10 (NN, RB) 241\n", "41 (DT, NNS) 240\n", "4 (IN, PRP$) 226\n", "122 (IN, PRP) 213\n", "43 (VBD, IN) 209" ] }, "execution_count": 216, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_n.sort_values(by=[\"count\"], ascending=False)[:20]" ] }, { "cell_type": "code", "execution_count": 217, "metadata": {}, "outputs": [], "source": [ "from nltk import word_tokenize \n", "from nltk.util import ngrams\n", "\n", "text = ['cant railway station', 'citadel hotel', 'police stn']\n", "def get_ngram(line, num):\n", " token = nltk.word_tokenize(line)\n", " grams = list(ngrams(token, num)) \n", " return(grams)\n", "\n", "# all_df['trigrams'] = all_df.apply(lambda x: get_ngram(x[0],3), axis=1)\n", "all_df['trigrams'] = all_df.apply(lambda x: get_ngram(' '.join(x['tokens']),3), axis=1)\n", "all_df['trigrams_pos'] = all_df.apply(lambda x: get_ngram(' '.join(x['pos_sent']),3), axis=1)\n", "\n", "# ' '.join(all_df['tokens'][1])\n", " \n", "# counter = all_df['trigrams_pos']" ] }, { "cell_type": "code", "execution_count": 218, "metadata": {}, "outputs": [], "source": [ "all_df['trigrams_feats'] = all_df.apply(lambda x: ['_'.join(trigram) for trigram in x['trigrams_pos']], axis=1)" ] }, { "cell_type": "code", "execution_count": 219, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array(['IN_PRP_$'], dtype='\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NN_VBD_TOVBD_TO_VBTO_VB_INVB_IN_PRPIN_PRP_$PRP_$_NN$_NN_INNN_IN_DTIN_DT_JJDT_JJ_NN...VBP_VBN_PDTWP_VBD_NNSVBZ_VBD_NNDT_WRB_PRPVBD_PRP_RBRPRP_RBR_INJJ_CC_CDVBN_VBN_RPNN_PRP_VBNPRP_VBN_RB
PoN
N1111132211...0000000000
N0000220212...0000000000
N0000010011...0000000000
N0000221123...0000000000
N0030350224...0000000000
\n", "

5 rows × 3696 columns

\n", "" ], "text/plain": [ " NN_VBD_TO VBD_TO_VB TO_VB_IN VB_IN_PRP IN_PRP_$ PRP_$_NN $_NN_IN \\\n", "PoN \n", "N 1 1 1 1 1 3 2 \n", "N 0 0 0 0 2 2 0 \n", "N 0 0 0 0 0 1 0 \n", "N 0 0 0 0 2 2 1 \n", "N 0 0 3 0 3 5 0 \n", "\n", " NN_IN_DT IN_DT_JJ DT_JJ_NN ... VBP_VBN_PDT WP_VBD_NNS VBZ_VBD_NN \\\n", "PoN ... \n", "N 2 1 1 ... 0 0 0 \n", "N 2 1 2 ... 0 0 0 \n", "N 0 1 1 ... 0 0 0 \n", "N 1 2 3 ... 0 0 0 \n", "N 2 2 4 ... 0 0 0 \n", "\n", " DT_WRB_PRP VBD_PRP_RBR PRP_RBR_IN JJ_CC_CD VBN_VBN_RP NN_PRP_VBN \\\n", "PoN \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "N 0 0 0 0 0 0 \n", "\n", " PRP_VBN_RB \n", "PoN \n", "N 0 \n", "N 0 \n", "N 0 \n", "N 0 \n", "N 0 \n", "\n", "[5 rows x 3696 columns]" ] }, "execution_count": 221, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_df = pd.DataFrame(all_df['trigrams_feats_bow'].tolist(), all_df['PoN'])\n", "new_df = new_df.fillna(0).astype(int)\n", "new_df[:5]\n" ] }, { "cell_type": "code", "execution_count": 222, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5520833333333334\n", "Accuracy: 0.6145833333333334\n", "Accuracy: 0.5520833333333334\n", "Accuracy: 0.5729166666666666\n", "Accuracy: 0.5104166666666666\n" ] } ], "source": [ "get_NB(new_df, new_df.index)" ] }, { "cell_type": "code", "execution_count": 223, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.53125\n", "Accuracy: 0.6041666666666666\n", "Accuracy: 0.4791666666666667\n", "Accuracy: 0.5833333333333334\n", "Accuracy: 0.5208333333333334\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NN_VBD_TOVBD_TO_VBTO_VB_INVB_IN_PRPIN_PRP_$PRP_$_NN$_NN_INNN_IN_DTIN_DT_JJDT_JJ_NN...VBP_VBN_PDTWP_VBD_NNSVBZ_VBD_NNDT_WRB_PRPVBD_PRP_RBRPRP_RBR_INJJ_CC_CDVBN_VBN_RPNN_PRP_VBNPRP_VBN_RB
PoN
N0.0144930.0144930.0144930.0144930.0144930.0434780.0289860.0289860.0144930.014493...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
N0.0000000.0000000.0000000.0000000.0155040.0155040.0000000.0155040.0077520.015504...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
N0.0000000.0000000.0000000.0000000.0000000.0147060.0000000.0000000.0147060.014706...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
N0.0000000.0000000.0000000.0000000.0178570.0178570.0089290.0089290.0178570.026786...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
N0.0000000.0000000.0114940.0000000.0114940.0191570.0000000.0076630.0076630.015326...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
..................................................................
P0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0166670.0000000.000000...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
P0.0000000.0000000.0000000.0000000.0135140.0067570.0000000.0270270.0270270.027027...0.0067570.0067570.0067570.0067570.0000000.0000000.0000000.0000000.0000000.000000
P0.0000000.0000000.0000000.0000000.0076340.0076340.0000000.0305340.0076340.015267...0.0000000.0000000.0000000.0000000.0076340.0076340.0000000.0000000.0000000.000000
P0.0000000.0000000.0000000.0000000.0064520.0064520.0000000.0258060.0193550.019355...0.0000000.0000000.0000000.0000000.0000000.0000000.0064520.0064520.0064520.006452
P0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.030303...0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
\n", "

320 rows × 3696 columns

\n", "
" ], "text/plain": [ " NN_VBD_TO VBD_TO_VB TO_VB_IN VB_IN_PRP IN_PRP_$ PRP_$_NN $_NN_IN \\\n", "PoN \n", "N 0.014493 0.014493 0.014493 0.014493 0.014493 0.043478 0.028986 \n", "N 0.000000 0.000000 0.000000 0.000000 0.015504 0.015504 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.014706 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.017857 0.017857 0.008929 \n", "N 0.000000 0.000000 0.011494 0.000000 0.011494 0.019157 0.000000 \n", ".. ... ... ... ... ... ... ... \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.013514 0.006757 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.007634 0.007634 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.006452 0.006452 0.000000 \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "\n", " NN_IN_DT IN_DT_JJ DT_JJ_NN ... VBP_VBN_PDT WP_VBD_NNS VBZ_VBD_NN \\\n", "PoN ... \n", "N 0.028986 0.014493 0.014493 ... 0.000000 0.000000 0.000000 \n", "N 0.015504 0.007752 0.015504 ... 0.000000 0.000000 0.000000 \n", "N 0.000000 0.014706 0.014706 ... 0.000000 0.000000 0.000000 \n", "N 0.008929 0.017857 0.026786 ... 0.000000 0.000000 0.000000 \n", "N 0.007663 0.007663 0.015326 ... 0.000000 0.000000 0.000000 \n", ".. ... ... ... ... ... ... ... \n", "P 0.016667 0.000000 0.000000 ... 0.000000 0.000000 0.000000 \n", "P 0.027027 0.027027 0.027027 ... 0.006757 0.006757 0.006757 \n", "P 0.030534 0.007634 0.015267 ... 0.000000 0.000000 0.000000 \n", "P 0.025806 0.019355 0.019355 ... 0.000000 0.000000 0.000000 \n", "P 0.000000 0.000000 0.030303 ... 0.000000 0.000000 0.000000 \n", "\n", " DT_WRB_PRP VBD_PRP_RBR PRP_RBR_IN JJ_CC_CD VBN_VBN_RP NN_PRP_VBN \\\n", "PoN \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "N 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", ".. ... ... ... ... ... ... \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.006757 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "P 0.000000 0.007634 0.007634 0.000000 0.000000 0.000000 \n", "P 0.000000 0.000000 0.000000 0.006452 0.006452 0.006452 \n", "P 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "\n", " PRP_VBN_RB \n", "PoN \n", "N 0.000000 \n", "N 0.000000 \n", "N 0.000000 \n", "N 0.000000 \n", "N 0.000000 \n", ".. ... \n", "P 0.000000 \n", "P 0.000000 \n", "P 0.000000 \n", "P 0.006452 \n", "P 0.000000 \n", "\n", "[320 rows x 3696 columns]" ] }, "execution_count": 223, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tri_df_norm = new_df.copy()\n", "tri_df_norm = tri_df_norm.apply(lambda x: x/x.sum(), axis=1)\n", "tri_df_norm\n", "\n", "get_NB(tri_df_norm, tri_df_norm.index)\n", "tri_df_norm" ] }, { "cell_type": "code", "execution_count": 224, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swstemmedstemmed_no_sw...bow_no_swpos_sentpos_sent_strpos_no_sw_sentpos_sent_bibow_postrigramstrigrams_postrigrams_featstrigrams_feats_bow
0I traveled to Chicago with my husband for a ro...N[I traveled to Chicago with my husband for a r...6[i, traveled, to, chicago, with, my, husband, ...68[traveled, chicago, husband, romantic, weekend...40[i, travel, to, chicago, with, my, husband, fo...[travel, chicago, husband, romant, weekend, aw......{'traveled': 1, 'chicago': 2, 'husband': 1, 'r...[NN, VBD, TO, VB, IN, PRP$, NN, IN, DT, JJ, NN...[NN VBD TO VB IN PRP$ NN IN DT JJ NN RB PRP$ N...[VBN, JJ, NN, JJ, NN, RB, JJ, JJ, NN, NN, NN, ...[(NN, VBD), (VBD, TO), (TO, VB), (VB, IN), (IN...{('NN', 'VBD'): 4, ('VBD', 'TO'): 1, ('TO', 'V...[(i, traveled, to), (traveled, to, chicago), (...[(NN, VBD, TO), (VBD, TO, VB), (TO, VB, IN), (...[NN_VBD_TO, VBD_TO_VB, TO_VB_IN, VB_IN_PRP, IN...{'NN_VBD_TO': 1, 'VBD_TO_VB': 1, 'TO_VB_IN': 1...
1I stayed in the Sofitel Chicago Water Tower ho...N[I stayed in the Sofitel Chicago Water Tower h...6[i, stayed, in, the, sofitel, chicago, water, ...129[stayed, sofitel, chicago, water, tower, hotel...71[i, stay, in, the, sofitel, chicago, water, to...[stay, sofitel, chicago, water, tower, hotel, ......{'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat...[JJ, VBD, IN, DT, NN, NN, NN, NN, NN, IN, PRP$...[JJ VBD IN DT NN NN NN NN NN IN PRP$ NN CC CD ...[JJ, NN, NN, NN, NN, NN, NN, CD, NNS, JJ, NN, ...[(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN...{('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D...[(i, stayed, in), (stayed, in, the), (in, the,...[(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (...[JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_NN, NN_...{'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 2...
2This hotel was gorgeous! I really enjoyed my s...N[This hotel was gorgeous!, I really enjoyed my...7[this, hotel, was, gorgeous, i, really, enjoye...69[hotel, gorgeous, really, enjoyed, stay, defin...36[thi, hotel, wa, gorgeou, i, realli, enjoy, my...[hotel, gorgeou, realli, enjoy, stay, definit,......{'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo...[DT, NN, VBD, JJ, JJ, RB, VBN, PRP$, NN, RB, C...[DT NN VBD JJ JJ RB VBN PRP$ NN RB CC MD RB VB...[NN, JJ, RB, VBN, NN, RB, VBG, JJ, NN, JJ, NN,...[(DT, NN), (NN, VBD), (VBD, JJ), (JJ, JJ), (JJ...{('DT', 'NN'): 8, ('NN', 'VBD'): 6, ('VBD', 'J...[(this, hotel, was), (hotel, was, gorgeous), (...[(DT, NN, VBD), (NN, VBD, JJ), (VBD, JJ, JJ), ...[DT_NN_VBD, NN_VBD_JJ, VBD_JJ_JJ, JJ_JJ_RB, JJ...{'DT_NN_VBD': 4, 'NN_VBD_JJ': 4, 'VBD_JJ_JJ': ...
3This is an absolutely exquisite hotel, at a gr...N[This is an absolutely exquisite hotel, at a g...6[this, is, an, absolutely, exquisite, hotel, a...110[absolutely, exquisite, hotel, great, location...52[thi, is, an, absolut, exquisit, hotel, at, a,...[absolut, exquisit, hotel, great, locat, boast......{'absolutely': 1, 'exquisite': 1, 'hotel': 3, ...[DT, VBZ, DT, RB, JJ, NN, IN, DT, JJ, NN, CC, ...[DT VBZ DT RB JJ NN IN DT JJ NN CC NN NN NNS N...[RB, JJ, NN, JJ, NN, VBG, NN, NNS, JJ, NN, RB,...[(DT, VBZ), (VBZ, DT), (DT, RB), (RB, JJ), (JJ...{('DT', 'VBZ'): 1, ('VBZ', 'DT'): 2, ('DT', 'R...[(this, is, an), (is, an, absolutely), (an, ab...[(DT, VBZ, DT), (VBZ, DT, RB), (DT, RB, JJ), (...[DT_VBZ_DT, VBZ_DT_RB, DT_RB_JJ, RB_JJ_NN, JJ_...{'DT_VBZ_DT': 1, 'VBZ_DT_RB': 1, 'DT_RB_JJ': 1...
4I recently traveled up to Chicago for business...N[I recently traveled up to Chicago for busines...13[i, recently, traveled, up, to, chicago, for, ...257[recently, traveled, chicago, business, terrif...116[i, recent, travel, up, to, chicago, for, busi...[recent, travel, chicago, busi, terrif, day, n......{'recently': 1, 'traveled': 1, 'chicago': 4, '...[NN, RB, VBD, RP, TO, VB, IN, NN, CC, VBD, DT,...[NN RB VBD RP TO VB IN NN CC VBD DT JJ NN NN N...[RB, VBN, NN, NN, NN, NN, NN, VB, RB, JJ, NN, ...[(NN, RB), (RB, VBD), (VBD, RP), (RP, TO), (TO...{('NN', 'RB'): 3, ('RB', 'VBD'): 3, ('VBD', 'R...[(i, recently, traveled), (recently, traveled,...[(NN, RB, VBD), (RB, VBD, RP), (VBD, RP, TO), ...[NN_RB_VBD, RB_VBD_RP, VBD_RP_TO, RP_TO_VB, TO...{'NN_RB_VBD': 1, 'RB_VBD_RP': 2, 'VBD_RP_TO': ...
..................................................................
315This hotel was not worth it. From the moment w...P[This hotel was not worth it., From the moment...6[this, hotel, was, not, worth, it, from, the, ...62[hotel, worth, moment, walked, hotel, lobby, c...27[thi, hotel, wa, not, worth, it, from, the, mo...[hotel, worth, moment, walk, hotel, lobbi, che......{'hotel': 2, 'worth': 1, 'moment': 1, 'walked'...[DT, NN, VBD, RB, JJ, PRP, IN, DT, NN, PRP, VB...[DT NN VBD RB JJ PRP IN DT NN PRP VBD IN DT NN...[NN, JJ, NN, VBD, NN, NN, NN, NN, VBP, JJ, NNS...[(DT, NN), (NN, VBD), (VBD, RB), (RB, JJ), (JJ...{('DT', 'NN'): 7, ('NN', 'VBD'): 4, ('VBD', 'R...[(this, hotel, was), (hotel, was, not), (was, ...[(DT, NN, VBD), (NN, VBD, RB), (VBD, RB, JJ), ...[DT_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB_JJ_PRP, J...{'DT_NN_VBD': 3, 'NN_VBD_RB': 1, 'VBD_RB_JJ': ...
316I stayed at the hotel during the Dave Matthews...P[I stayed at the hotel during the Dave Matthew...9[i, stayed, at, the, hotel, during, the, dave,...146[stayed, hotel, dave, matthews, caravan, tour,...76[i, stay, at, the, hotel, dure, the, dave, mat...[stay, hotel, dave, matthew, caravan, tour, wo......{'stayed': 1, 'hotel': 3, 'dave': 1, 'matthews...[JJ, VBD, IN, DT, NN, IN, DT, NN, NNS, VBP, JJ...[JJ VBD IN DT NN IN DT NN NNS VBP JJ CC MD VB ...[JJ, NN, VBP, NNS, VB, NNS, MD, VB, NN, NN, RB...[(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN...{('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D...[(i, stayed, at), (stayed, at, the), (at, the,...[(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (...[JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_IN, NN_...{'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 3...
317We had a reservation for 3 rooms with 5 adults...P[We had a reservation for 3 rooms with 5 adult...9[we, had, a, reservation, for, rooms, with, ad...132[reservation, rooms, adults, kids, got, rooms,...58[we, had, a, reserv, for, room, with, adult, a...[reserv, room, adult, kid, got, room, arriv, c......{'reservation': 1, 'rooms': 3, 'adults': 1, 'k...[PRP, VBD, DT, NN, IN, NNS, IN, NNS, CC, NNS, ...[PRP VBD DT NN IN NNS IN NNS CC NNS VBD RB NNS...[NN, NNS, NNS, NNS, VBD, NNS, JJ, NN, NNS, JJ,...[(PRP, VBD), (VBD, DT), (DT, NN), (NN, IN), (I...{('PRP', 'VBD'): 8, ('VBD', 'DT'): 1, ('DT', '...[(we, had, a), (had, a, reservation), (a, rese...[(PRP, VBD, DT), (VBD, DT, NN), (DT, NN, IN), ...[PRP_VBD_DT, VBD_DT_NN, DT_NN_IN, NN_IN_NNS, I...{'PRP_VBD_DT': 1, 'VBD_DT_NN': 1, 'DT_NN_IN': ...
318I am staying here now and actually am compelle...P[I am staying here now and actually am compell...6[i, am, staying, here, now, and, actually, am,...156[staying, actually, compelled, write, review, ...72[i, am, stay, here, now, and, actual, am, comp...[stay, actual, compel, write, review, fall, as......{'staying': 1, 'actually': 1, 'compelled': 1, ...[NN, VBP, VBG, RB, RB, CC, RB, VBP, VBN, TO, V...[NN VBP VBG RB RB CC RB VBP VBN TO VB DT NN IN...[VBG, RB, VBN, JJ, NN, NN, JJ, NN, JJ, NN, NN,...[(NN, VBP), (VBP, VBG), (VBG, RB), (RB, RB), (...{('NN', 'VBP'): 1, ('VBP', 'VBG'): 1, ('VBG', ...[(i, am, staying), (am, staying, here), (stayi...[(NN, VBP, VBG), (VBP, VBG, RB), (VBG, RB, RB)...[NN_VBP_VBG, VBP_VBG_RB, VBG_RB_RB, RB_RB_CC, ...{'NN_VBP_VBG': 1, 'VBP_VBG_RB': 1, 'VBG_RB_RB'...
319We enjoyed the Hotel Monaco. Great location fo...P[We enjoyed the Hotel Monaco., Great location ...4[we, enjoyed, the, hotel, monaco, great, locat...35[enjoyed, hotel, monaco, great, location, walk...19[we, enjoy, the, hotel, monaco, great, locat, ...[enjoy, hotel, monaco, great, locat, walk, bea......{'enjoyed': 2, 'hotel': 1, 'monaco': 1, 'great...[PRP, VBD, DT, NN, VBZ, JJ, NN, IN, NN, CC, NN...[PRP VBD DT NN VBZ JJ NN IN NN CC NN NNS DT NN...[JJ, NN, NN, JJ, NN, VBG, JJ, NNS, NN, RB, RB,...[(PRP, VBD), (VBD, DT), (DT, NN), (NN, VBZ), (...{('PRP', 'VBD'): 1, ('VBD', 'DT'): 2, ('DT', '...[(we, enjoyed, the), (enjoyed, the, hotel), (t...[(PRP, VBD, DT), (VBD, DT, NN), (DT, NN, VBZ),...[PRP_VBD_DT, VBD_DT_NN, DT_NN_VBZ, NN_VBZ_JJ, ...{'PRP_VBD_DT': 1, 'VBD_DT_NN': 1, 'DT_NN_VBZ':...
\n", "

320 rows × 27 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 I traveled to Chicago with my husband for a ro... N \n", "1 I stayed in the Sofitel Chicago Water Tower ho... N \n", "2 This hotel was gorgeous! I really enjoyed my s... N \n", "3 This is an absolutely exquisite hotel, at a gr... N \n", "4 I recently traveled up to Chicago for business... N \n", ".. ... .. \n", "315 This hotel was not worth it. From the moment w... P \n", "316 I stayed at the hotel during the Dave Matthews... P \n", "317 We had a reservation for 3 rooms with 5 adults... P \n", "318 I am staying here now and actually am compelle... P \n", "319 We enjoyed the Hotel Monaco. Great location fo... P \n", "\n", " sentences num_sentences \\\n", "0 [I traveled to Chicago with my husband for a r... 6 \n", "1 [I stayed in the Sofitel Chicago Water Tower h... 6 \n", "2 [This hotel was gorgeous!, I really enjoyed my... 7 \n", "3 [This is an absolutely exquisite hotel, at a g... 6 \n", "4 [I recently traveled up to Chicago for busines... 13 \n", ".. ... ... \n", "315 [This hotel was not worth it., From the moment... 6 \n", "316 [I stayed at the hotel during the Dave Matthew... 9 \n", "317 [We had a reservation for 3 rooms with 5 adult... 9 \n", "318 [I am staying here now and actually am compell... 6 \n", "319 [We enjoyed the Hotel Monaco., Great location ... 4 \n", "\n", " tokens num_tokens \\\n", "0 [i, traveled, to, chicago, with, my, husband, ... 68 \n", "1 [i, stayed, in, the, sofitel, chicago, water, ... 129 \n", "2 [this, hotel, was, gorgeous, i, really, enjoye... 69 \n", "3 [this, is, an, absolutely, exquisite, hotel, a... 110 \n", "4 [i, recently, traveled, up, to, chicago, for, ... 257 \n", ".. ... ... \n", "315 [this, hotel, was, not, worth, it, from, the, ... 62 \n", "316 [i, stayed, at, the, hotel, during, the, dave,... 146 \n", "317 [we, had, a, reservation, for, rooms, with, ad... 132 \n", "318 [i, am, staying, here, now, and, actually, am,... 156 \n", "319 [we, enjoyed, the, hotel, monaco, great, locat... 35 \n", "\n", " no_sw num_no_sw \\\n", "0 [traveled, chicago, husband, romantic, weekend... 40 \n", "1 [stayed, sofitel, chicago, water, tower, hotel... 71 \n", "2 [hotel, gorgeous, really, enjoyed, stay, defin... 36 \n", "3 [absolutely, exquisite, hotel, great, location... 52 \n", "4 [recently, traveled, chicago, business, terrif... 116 \n", ".. ... ... \n", "315 [hotel, worth, moment, walked, hotel, lobby, c... 27 \n", "316 [stayed, hotel, dave, matthews, caravan, tour,... 76 \n", "317 [reservation, rooms, adults, kids, got, rooms,... 58 \n", "318 [staying, actually, compelled, write, review, ... 72 \n", "319 [enjoyed, hotel, monaco, great, location, walk... 19 \n", "\n", " stemmed \\\n", "0 [i, travel, to, chicago, with, my, husband, fo... \n", "1 [i, stay, in, the, sofitel, chicago, water, to... \n", "2 [thi, hotel, wa, gorgeou, i, realli, enjoy, my... \n", "3 [thi, is, an, absolut, exquisit, hotel, at, a,... \n", "4 [i, recent, travel, up, to, chicago, for, busi... \n", ".. ... \n", "315 [thi, hotel, wa, not, worth, it, from, the, mo... \n", "316 [i, stay, at, the, hotel, dure, the, dave, mat... \n", "317 [we, had, a, reserv, for, room, with, adult, a... \n", "318 [i, am, stay, here, now, and, actual, am, comp... \n", "319 [we, enjoy, the, hotel, monaco, great, locat, ... \n", "\n", " stemmed_no_sw ... \\\n", "0 [travel, chicago, husband, romant, weekend, aw... ... \n", "1 [stay, sofitel, chicago, water, tower, hotel, ... ... \n", "2 [hotel, gorgeou, realli, enjoy, stay, definit,... ... \n", "3 [absolut, exquisit, hotel, great, locat, boast... ... \n", "4 [recent, travel, chicago, busi, terrif, day, n... ... \n", ".. ... ... \n", "315 [hotel, worth, moment, walk, hotel, lobbi, che... ... \n", "316 [stay, hotel, dave, matthew, caravan, tour, wo... ... \n", "317 [reserv, room, adult, kid, got, room, arriv, c... ... \n", "318 [stay, actual, compel, write, review, fall, as... ... \n", "319 [enjoy, hotel, monaco, great, locat, walk, bea... ... \n", "\n", " bow_no_sw \\\n", "0 {'traveled': 1, 'chicago': 2, 'husband': 1, 'r... \n", "1 {'stayed': 1, 'sofitel': 1, 'chicago': 1, 'wat... \n", "2 {'hotel': 2, 'gorgeous': 1, 'really': 1, 'enjo... \n", "3 {'absolutely': 1, 'exquisite': 1, 'hotel': 3, ... \n", "4 {'recently': 1, 'traveled': 1, 'chicago': 4, '... \n", ".. ... \n", "315 {'hotel': 2, 'worth': 1, 'moment': 1, 'walked'... \n", "316 {'stayed': 1, 'hotel': 3, 'dave': 1, 'matthews... \n", "317 {'reservation': 1, 'rooms': 3, 'adults': 1, 'k... \n", "318 {'staying': 1, 'actually': 1, 'compelled': 1, ... \n", "319 {'enjoyed': 2, 'hotel': 1, 'monaco': 1, 'great... \n", "\n", " pos_sent \\\n", "0 [NN, VBD, TO, VB, IN, PRP$, NN, IN, DT, JJ, NN... \n", "1 [JJ, VBD, IN, DT, NN, NN, NN, NN, NN, IN, PRP$... \n", "2 [DT, NN, VBD, JJ, JJ, RB, VBN, PRP$, NN, RB, C... \n", "3 [DT, VBZ, DT, RB, JJ, NN, IN, DT, JJ, NN, CC, ... \n", "4 [NN, RB, VBD, RP, TO, VB, IN, NN, CC, VBD, DT,... \n", ".. ... \n", "315 [DT, NN, VBD, RB, JJ, PRP, IN, DT, NN, PRP, VB... \n", "316 [JJ, VBD, IN, DT, NN, IN, DT, NN, NNS, VBP, JJ... \n", "317 [PRP, VBD, DT, NN, IN, NNS, IN, NNS, CC, NNS, ... \n", "318 [NN, VBP, VBG, RB, RB, CC, RB, VBP, VBN, TO, V... \n", "319 [PRP, VBD, DT, NN, VBZ, JJ, NN, IN, NN, CC, NN... \n", "\n", " pos_sent_str \\\n", "0 [NN VBD TO VB IN PRP$ NN IN DT JJ NN RB PRP$ N... \n", "1 [JJ VBD IN DT NN NN NN NN NN IN PRP$ NN CC CD ... \n", "2 [DT NN VBD JJ JJ RB VBN PRP$ NN RB CC MD RB VB... \n", "3 [DT VBZ DT RB JJ NN IN DT JJ NN CC NN NN NNS N... \n", "4 [NN RB VBD RP TO VB IN NN CC VBD DT JJ NN NN N... \n", ".. ... \n", "315 [DT NN VBD RB JJ PRP IN DT NN PRP VBD IN DT NN... \n", "316 [JJ VBD IN DT NN IN DT NN NNS VBP JJ CC MD VB ... \n", "317 [PRP VBD DT NN IN NNS IN NNS CC NNS VBD RB NNS... \n", "318 [NN VBP VBG RB RB CC RB VBP VBN TO VB DT NN IN... \n", "319 [PRP VBD DT NN VBZ JJ NN IN NN CC NN NNS DT NN... \n", "\n", " pos_no_sw_sent \\\n", "0 [VBN, JJ, NN, JJ, NN, RB, JJ, JJ, NN, NN, NN, ... \n", "1 [JJ, NN, NN, NN, NN, NN, NN, CD, NNS, JJ, NN, ... \n", "2 [NN, JJ, RB, VBN, NN, RB, VBG, JJ, NN, JJ, NN,... \n", "3 [RB, JJ, NN, JJ, NN, VBG, NN, NNS, JJ, NN, RB,... \n", "4 [RB, VBN, NN, NN, NN, NN, NN, VB, RB, JJ, NN, ... \n", ".. ... \n", "315 [NN, JJ, NN, VBD, NN, NN, NN, NN, VBP, JJ, NNS... \n", "316 [JJ, NN, VBP, NNS, VB, NNS, MD, VB, NN, NN, RB... \n", "317 [NN, NNS, NNS, NNS, VBD, NNS, JJ, NN, NNS, JJ,... \n", "318 [VBG, RB, VBN, JJ, NN, NN, JJ, NN, JJ, NN, NN,... \n", "319 [JJ, NN, NN, JJ, NN, VBG, JJ, NNS, NN, RB, RB,... \n", "\n", " pos_sent_bi \\\n", "0 [(NN, VBD), (VBD, TO), (TO, VB), (VB, IN), (IN... \n", "1 [(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN... \n", "2 [(DT, NN), (NN, VBD), (VBD, JJ), (JJ, JJ), (JJ... \n", "3 [(DT, VBZ), (VBZ, DT), (DT, RB), (RB, JJ), (JJ... \n", "4 [(NN, RB), (RB, VBD), (VBD, RP), (RP, TO), (TO... \n", ".. ... \n", "315 [(DT, NN), (NN, VBD), (VBD, RB), (RB, JJ), (JJ... \n", "316 [(JJ, VBD), (VBD, IN), (IN, DT), (DT, NN), (NN... \n", "317 [(PRP, VBD), (VBD, DT), (DT, NN), (NN, IN), (I... \n", "318 [(NN, VBP), (VBP, VBG), (VBG, RB), (RB, RB), (... \n", "319 [(PRP, VBD), (VBD, DT), (DT, NN), (NN, VBZ), (... \n", "\n", " bow_pos \\\n", "0 {('NN', 'VBD'): 4, ('VBD', 'TO'): 1, ('TO', 'V... \n", "1 {('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D... \n", "2 {('DT', 'NN'): 8, ('NN', 'VBD'): 6, ('VBD', 'J... \n", "3 {('DT', 'VBZ'): 1, ('VBZ', 'DT'): 2, ('DT', 'R... \n", "4 {('NN', 'RB'): 3, ('RB', 'VBD'): 3, ('VBD', 'R... \n", ".. ... \n", "315 {('DT', 'NN'): 7, ('NN', 'VBD'): 4, ('VBD', 'R... \n", "316 {('JJ', 'VBD'): 1, ('VBD', 'IN'): 1, ('IN', 'D... \n", "317 {('PRP', 'VBD'): 8, ('VBD', 'DT'): 1, ('DT', '... \n", "318 {('NN', 'VBP'): 1, ('VBP', 'VBG'): 1, ('VBG', ... \n", "319 {('PRP', 'VBD'): 1, ('VBD', 'DT'): 2, ('DT', '... \n", "\n", " trigrams \\\n", "0 [(i, traveled, to), (traveled, to, chicago), (... \n", "1 [(i, stayed, in), (stayed, in, the), (in, the,... \n", "2 [(this, hotel, was), (hotel, was, gorgeous), (... \n", "3 [(this, is, an), (is, an, absolutely), (an, ab... \n", "4 [(i, recently, traveled), (recently, traveled,... \n", ".. ... \n", "315 [(this, hotel, was), (hotel, was, not), (was, ... \n", "316 [(i, stayed, at), (stayed, at, the), (at, the,... \n", "317 [(we, had, a), (had, a, reservation), (a, rese... \n", "318 [(i, am, staying), (am, staying, here), (stayi... \n", "319 [(we, enjoyed, the), (enjoyed, the, hotel), (t... \n", "\n", " trigrams_pos \\\n", "0 [(NN, VBD, TO), (VBD, TO, VB), (TO, VB, IN), (... \n", "1 [(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (... \n", "2 [(DT, NN, VBD), (NN, VBD, JJ), (VBD, JJ, JJ), ... \n", "3 [(DT, VBZ, DT), (VBZ, DT, RB), (DT, RB, JJ), (... \n", "4 [(NN, RB, VBD), (RB, VBD, RP), (VBD, RP, TO), ... \n", ".. ... \n", "315 [(DT, NN, VBD), (NN, VBD, RB), (VBD, RB, JJ), ... \n", "316 [(JJ, VBD, IN), (VBD, IN, DT), (IN, DT, NN), (... \n", "317 [(PRP, VBD, DT), (VBD, DT, NN), (DT, NN, IN), ... \n", "318 [(NN, VBP, VBG), (VBP, VBG, RB), (VBG, RB, RB)... \n", "319 [(PRP, VBD, DT), (VBD, DT, NN), (DT, NN, VBZ),... \n", "\n", " trigrams_feats \\\n", "0 [NN_VBD_TO, VBD_TO_VB, TO_VB_IN, VB_IN_PRP, IN... \n", "1 [JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_NN, NN_... \n", "2 [DT_NN_VBD, NN_VBD_JJ, VBD_JJ_JJ, JJ_JJ_RB, JJ... \n", "3 [DT_VBZ_DT, VBZ_DT_RB, DT_RB_JJ, RB_JJ_NN, JJ_... \n", "4 [NN_RB_VBD, RB_VBD_RP, VBD_RP_TO, RP_TO_VB, TO... \n", ".. ... \n", "315 [DT_NN_VBD, NN_VBD_RB, VBD_RB_JJ, RB_JJ_PRP, J... \n", "316 [JJ_VBD_IN, VBD_IN_DT, IN_DT_NN, DT_NN_IN, NN_... \n", "317 [PRP_VBD_DT, VBD_DT_NN, DT_NN_IN, NN_IN_NNS, I... \n", "318 [NN_VBP_VBG, VBP_VBG_RB, VBG_RB_RB, RB_RB_CC, ... \n", "319 [PRP_VBD_DT, VBD_DT_NN, DT_NN_VBZ, NN_VBZ_JJ, ... \n", "\n", " trigrams_feats_bow \n", "0 {'NN_VBD_TO': 1, 'VBD_TO_VB': 1, 'TO_VB_IN': 1... \n", "1 {'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 2... \n", "2 {'DT_NN_VBD': 4, 'NN_VBD_JJ': 4, 'VBD_JJ_JJ': ... \n", "3 {'DT_VBZ_DT': 1, 'VBZ_DT_RB': 1, 'DT_RB_JJ': 1... \n", "4 {'NN_RB_VBD': 1, 'RB_VBD_RP': 2, 'VBD_RP_TO': ... \n", ".. ... \n", "315 {'DT_NN_VBD': 3, 'NN_VBD_RB': 1, 'VBD_RB_JJ': ... \n", "316 {'JJ_VBD_IN': 1, 'VBD_IN_DT': 1, 'IN_DT_NN': 3... \n", "317 {'PRP_VBD_DT': 1, 'VBD_DT_NN': 1, 'DT_NN_IN': ... \n", "318 {'NN_VBP_VBG': 1, 'VBP_VBG_RB': 1, 'VBG_RB_RB'... \n", "319 {'PRP_VBD_DT': 1, 'VBD_DT_NN': 1, 'DT_NN_VBZ':... \n", "\n", "[320 rows x 27 columns]" ] }, "execution_count": 224, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }