{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW4 [Deception] "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 1: GET THAT DATA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 172,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "def get_data(file, path):\n",
    "    f=open(path+file)\n",
    "    data = f.read()\n",
    "    f.close()\n",
    "    return data\n",
    "    \n",
    "def get_data_from_files(path):\n",
    "    results = [get_data(file, path) for file in os.listdir(path)]\n",
    "    return results\n",
    "\n",
    "# pos = get_data_from_files('../pos_cornell//')\n",
    "# neg = get_data_from_files('../neg_cornell/')\n",
    "pos = get_data_from_files('../hw4_lie_false/')\n",
    "neg = get_data_from_files('../hw4_lie_true/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 173,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I have been to a Asian restaurant in New York ...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>87</td>\n",
       "      <td>Mikes Pizza High Point NY Service was very slo...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>88</td>\n",
       "      <td>After I went shopping with some of my friend w...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>89</td>\n",
       "      <td>I entered the restaurant and a waitress came b...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>90</td>\n",
       "      <td>Carlos Plate Shack was the worst dining experi...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>91</td>\n",
       "      <td>Olive Oil Garden was very disappointing. I exp...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    0 PoN\n",
       "0                                                   ?   N\n",
       "1   Twin Trees Cicero NY HUGE salad bar and high q...   N\n",
       "2   The worst restaurant that I have ever eaten in...   N\n",
       "3                                                   ?   N\n",
       "4   I have been to a Asian restaurant in New York ...   N\n",
       "..                                                ...  ..\n",
       "87  Mikes Pizza High Point NY Service was very slo...   P\n",
       "88  After I went shopping with some of my friend w...   P\n",
       "89  I entered the restaurant and a waitress came b...   P\n",
       "90  Carlos Plate Shack was the worst dining experi...   P\n",
       "91  Olive Oil Garden was very disappointing. I exp...   P\n",
       "\n",
       "[92 rows x 2 columns]"
      ]
     },
     "execution_count": 173,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "neg_df = pd.DataFrame(neg)\n",
    "pos_df = pd.DataFrame(pos)\n",
    "pos_df['PoN'] = 'P'\n",
    "neg_df['PoN'] = 'N'\n",
    "all_df = neg_df.append(pos_df)\n",
    "all_df.reset_index(drop=True,inplace=True)\n",
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 2: TOKENIZE"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 174,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### -- 2a by sentence"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 175,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sentence_tokens(review):\n",
    "    return sent_tokenize(review)\n",
    "    \n",
    "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n",
    "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### -- 2b by word"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 176,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
    "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 177,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \n",
       "0                                                 []           0  \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53  \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105  "
      ]
     },
     "execution_count": 177,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 3: EXPERIMENT\n",
    "#### Experiment with: stopwords, stemming, lemming etc."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### -- 3a remove english stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 178,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n",
    "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 179,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0                                                 []           0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "\n",
       "                                               no_sw  num_no_sw  \n",
       "0                                                 []          0  \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, hi...         32  \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...         49  "
      ]
     },
     "execution_count": 179,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### -- 3b get stems for both tokens and no_sw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 180,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem import PorterStemmer\n",
    "def get_stems(sentence):\n",
    "    ps = PorterStemmer()\n",
    "    return [ps.stem(w) for w in sentence]\n",
    "    \n",
    "all_df['stemmed'] = all_df.apply(lambda x: get_stems(x['tokens']),axis=1)\n",
    "all_df['stemmed_no_sw'] = all_df.apply(lambda x: get_stems(x['no_sw']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 181,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>stemmed</th>\n",
       "      <th>stemmed_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "      <td>[the, worst, restaur, that, i, have, ever, eat...</td>\n",
       "      <td>[worst, restaur, ever, eaten, undoubtedli, pla...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0                                                 []           0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0                                                 []          0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, hi...         32   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...         49   \n",
       "\n",
       "                                             stemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaur, that, i, have, ever, eat...   \n",
       "\n",
       "                                       stemmed_no_sw  \n",
       "0                                                 []  \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...  \n",
       "2  [worst, restaur, ever, eaten, undoubtedli, pla...  "
      ]
     },
     "execution_count": 181,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### -- 3c get lemmas for both tokens and no_sw"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 182,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.stem.wordnet import WordNetLemmatizer\n",
    "def get_lemmas(sentence):\n",
    "    lem = WordNetLemmatizer() \n",
    "    return [lem.lemmatize(w) for w in sentence]\n",
    "    \n",
    "all_df['lemmed'] = all_df.apply(lambda x: get_lemmas(x['tokens']),axis=1)\n",
    "all_df['lemmed_no_sw'] = all_df.apply(lambda x: get_lemmas(x['no_sw']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 183,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>stemmed</th>\n",
       "      <th>stemmed_no_sw</th>\n",
       "      <th>lemmed</th>\n",
       "      <th>lemmed_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "      <td>[the, worst, restaur, that, i, have, ever, eat...</td>\n",
       "      <td>[worst, restaur, ever, eaten, undoubtedli, pla...</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0                                                 []           0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0                                                 []          0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, hi...         32   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...         49   \n",
       "\n",
       "                                             stemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaur, that, i, have, ever, eat...   \n",
       "\n",
       "                                       stemmed_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...   \n",
       "2  [worst, restaur, ever, eaten, undoubtedli, pla...   \n",
       "\n",
       "                                              lemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...   \n",
       "\n",
       "                                        lemmed_no_sw  \n",
       "0                                                 []  \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...  \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...  "
      ]
     },
     "execution_count": 183,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 184,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['pos'] = all_df.apply(lambda x: nltk.pos_tag(x['tokens']),axis=1)\n",
    "all_df['pos_no_sw'] = all_df.apply(lambda x: nltk.pos_tag(x['no_sw']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 185,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>stemmed</th>\n",
       "      <th>stemmed_no_sw</th>\n",
       "      <th>lemmed</th>\n",
       "      <th>lemmed_no_sw</th>\n",
       "      <th>pos</th>\n",
       "      <th>pos_no_sw</th>\n",
       "      <th>pos_dict</th>\n",
       "      <th>pos_dict_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "      <td>[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...</td>\n",
       "      <td>[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...</td>\n",
       "      <td>{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...</td>\n",
       "      <td>{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "      <td>[the, worst, restaur, that, i, have, ever, eat...</td>\n",
       "      <td>[worst, restaur, ever, eaten, undoubtedli, pla...</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>[(the, DT), (worst, JJS), (restaurant, NN), (t...</td>\n",
       "      <td>[(worst, RBS), (restaurant, NN), (ever, RB), (...</td>\n",
       "      <td>{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...</td>\n",
       "      <td>{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0                                                 []           0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0                                                 []          0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, hi...         32   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...         49   \n",
       "\n",
       "                                             stemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaur, that, i, have, ever, eat...   \n",
       "\n",
       "                                       stemmed_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...   \n",
       "2  [worst, restaur, ever, eaten, undoubtedli, pla...   \n",
       "\n",
       "                                              lemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...   \n",
       "\n",
       "                                        lemmed_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...   \n",
       "\n",
       "                                                 pos  \\\n",
       "0                                                 []   \n",
       "1  [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...   \n",
       "2  [(the, DT), (worst, JJS), (restaurant, NN), (t...   \n",
       "\n",
       "                                           pos_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...   \n",
       "2  [(worst, RBS), (restaurant, NN), (ever, RB), (...   \n",
       "\n",
       "                                            pos_dict  \\\n",
       "0                                                 {}   \n",
       "1  {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...   \n",
       "2  {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...   \n",
       "\n",
       "                                      pos_dict_no_sw  \n",
       "0                                                 {}  \n",
       "1  {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...  \n",
       "2  {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...  "
      ]
     },
     "execution_count": 185,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_pos_dict(pos_tuple):\n",
    "    pos_dict = {}\n",
    "    for t in pos_tuple:\n",
    "        if t[1] in pos_dict.keys():\n",
    "            pos_dict[t[1]] += 1\n",
    "        else:\n",
    "            pos_dict.update({t[1]: 1})\n",
    "    return pos_dict\n",
    "\n",
    "all_df['pos_dict'] = all_df.apply(lambda x: get_pos_dict(x['pos']), axis=1)\n",
    "all_df['pos_dict_no_sw'] = all_df.apply(lambda x: get_pos_dict(x['pos_no_sw']), axis=1)\n",
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 198,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>stemmed</th>\n",
       "      <th>stemmed_no_sw</th>\n",
       "      <th>lemmed</th>\n",
       "      <th>lemmed_no_sw</th>\n",
       "      <th>pos</th>\n",
       "      <th>pos_no_sw</th>\n",
       "      <th>pos_dict</th>\n",
       "      <th>pos_dict_no_sw</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, and...</td>\n",
       "      <td>[twin, tree, cicero, ny, huge, salad, bar, hig...</td>\n",
       "      <td>[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...</td>\n",
       "      <td>[(twin, NN), (trees, NNS), (cicero, VBP), (ny,...</td>\n",
       "      <td>{'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...</td>\n",
       "      <td>{'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...</td>\n",
       "      <td>{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...</td>\n",
       "      <td>{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "      <td>[the, worst, restaur, that, i, have, ever, eat...</td>\n",
       "      <td>[worst, restaur, ever, eaten, undoubtedli, pla...</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>[(the, DT), (worst, JJS), (restaurant, NN), (t...</td>\n",
       "      <td>[(worst, RBS), (restaurant, NN), (ever, RB), (...</td>\n",
       "      <td>{'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...</td>\n",
       "      <td>{'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...</td>\n",
       "      <td>{'the': 6, 'worst': 1, 'restaurant': 1, 'that'...</td>\n",
       "      <td>{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0                                                  ?   N   \n",
       "1  Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2  The worst restaurant that I have ever eaten in...   N   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0                                                [?]              1   \n",
       "1  [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2  [The worst restaurant that I have ever eaten i...              5   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0                                                 []           0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0                                                 []          0   \n",
       "1  [twin, trees, cicero, ny, huge, salad, bar, hi...         32   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...         49   \n",
       "\n",
       "                                             stemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaur, that, i, have, ever, eat...   \n",
       "\n",
       "                                       stemmed_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...   \n",
       "2  [worst, restaur, ever, eaten, undoubtedli, pla...   \n",
       "\n",
       "                                              lemmed  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, and...   \n",
       "2  [the, worst, restaurant, that, i, have, ever, ...   \n",
       "\n",
       "                                        lemmed_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [twin, tree, cicero, ny, huge, salad, bar, hig...   \n",
       "2  [worst, restaurant, ever, eaten, undoubtedly, ...   \n",
       "\n",
       "                                                 pos  \\\n",
       "0                                                 []   \n",
       "1  [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...   \n",
       "2  [(the, DT), (worst, JJS), (restaurant, NN), (t...   \n",
       "\n",
       "                                           pos_no_sw  \\\n",
       "0                                                 []   \n",
       "1  [(twin, NN), (trees, NNS), (cicero, VBP), (ny,...   \n",
       "2  [(worst, RBS), (restaurant, NN), (ever, RB), (...   \n",
       "\n",
       "                                            pos_dict  \\\n",
       "0                                                 {}   \n",
       "1  {'NN': 11, 'NNS': 3, 'VBP': 3, 'JJ': 9, 'CC': ...   \n",
       "2  {'DT': 14, 'JJS': 1, 'NN': 29, 'IN': 8, 'VBP':...   \n",
       "\n",
       "                                      pos_dict_no_sw  \\\n",
       "0                                                 {}   \n",
       "1  {'NN': 7, 'NNS': 5, 'VBP': 3, 'JJ': 10, 'RB': ...   \n",
       "2  {'RBS': 1, 'NN': 24, 'RB': 5, 'JJ': 9, 'VBN': ...   \n",
       "\n",
       "                                                 bow  \\\n",
       "0                                                 {}   \n",
       "1  {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...   \n",
       "2  {'the': 6, 'worst': 1, 'restaurant': 1, 'that'...   \n",
       "\n",
       "                                           bow_no_sw  \n",
       "0                                                 {}  \n",
       "1  {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...  \n",
       "2  {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...  "
      ]
     },
     "execution_count": 198,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def get_bow_from_tokens(df, column):\n",
    "    all_column_data = ' '.join(df[column].tolist())\n",
    "    all_column_fd = Counter(all_column_data.split())\n",
    "    return all_column_fd\n",
    "\n",
    "# bow = get_bow_from_column(all_df, 'diy_cleaner')\n",
    "# bow =\n",
    "from collections import Counter\n",
    "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "all_df['bow_no_sw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n",
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 4: TEST EXPERIMENTS!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 203,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "def get_NB(small_df, labels):\n",
    "    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "    gnb = GaussianNB()\n",
    "    gnb.fit(x_train, y_train)\n",
    "    y_pred = gnb.predict(x_test)\n",
    "    from sklearn import metrics\n",
    "    print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 204,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5714285714285714\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NN</th>\n",
       "      <th>NNS</th>\n",
       "      <th>VBP</th>\n",
       "      <th>JJ</th>\n",
       "      <th>CC</th>\n",
       "      <th>VBZ</th>\n",
       "      <th>DT</th>\n",
       "      <th>RB</th>\n",
       "      <th>VB</th>\n",
       "      <th>TO</th>\n",
       "      <th>...</th>\n",
       "      <th>VBG</th>\n",
       "      <th>EX</th>\n",
       "      <th>JJR</th>\n",
       "      <th>PDT</th>\n",
       "      <th>RP</th>\n",
       "      <th>WP</th>\n",
       "      <th>CD</th>\n",
       "      <th>RBR</th>\n",
       "      <th>MD</th>\n",
       "      <th>RBS</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PoN</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>11</td>\n",
       "      <td>3</td>\n",
       "      <td>3</td>\n",
       "      <td>9</td>\n",
       "      <td>3</td>\n",
       "      <td>2</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>3</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>29</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>7</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>14</td>\n",
       "      <td>8</td>\n",
       "      <td>4</td>\n",
       "      <td>4</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     NN  NNS  VBP  JJ  CC  VBZ  DT  RB  VB  TO  ...  VBG  EX  JJR  PDT  RP  \\\n",
       "PoN                                             ...                          \n",
       "N     0    0    0   0   0    0   0   0   0   0  ...    0   0    0    0   0   \n",
       "N    11    3    3   9   3    2   4   4   4   3  ...    0   0    0    0   0   \n",
       "N    29    1    1   7   5    1  14   8   4   4  ...    1   0    0    0   0   \n",
       "N     0    0    0   0   0    0   0   0   0   0  ...    0   0    0    0   0   \n",
       "N    13    2    2   5   1    2   5   0   0   1  ...    0   0    0    0   0   \n",
       "\n",
       "     WP  CD  RBR  MD  RBS  \n",
       "PoN                        \n",
       "N     0   0    0   0    0  \n",
       "N     0   0    0   0    0  \n",
       "N     0   0    0   0    0  \n",
       "N     0   0    0   0    0  \n",
       "N     0   0    0   0    0  \n",
       "\n",
       "[5 rows x 28 columns]"
      ]
     },
     "execution_count": 204,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "new_df = pd.DataFrame(all_df['pos_dict'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "get_NB(new_df, new_df.index)\n",
    "new_df[:5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 219,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def normalize_df(df):\n",
    "#     df[\"total\"] = df.sum(axis = 1)\n",
    "#     df = df.apply(lambda row: row/row[\"total\"], axis = 1)\n",
    "#     df.drop(\"total\", axis=1, inplace = True)\n",
    "#     return(df)\n",
    "\n",
    "def normalize_df(df):\n",
    "    names = df.columns\n",
    "    df[\"total\"] = df.sum(axis = 1)\n",
    "    for name in names:\n",
    "        df[name] = df[name]/df[\"total\"]\n",
    "    df.drop(\"total\", axis =1 , inplace = True)\n",
    "    return(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 222,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>NN</th>\n",
       "      <th>NNS</th>\n",
       "      <th>VBP</th>\n",
       "      <th>JJ</th>\n",
       "      <th>CC</th>\n",
       "      <th>VBZ</th>\n",
       "      <th>DT</th>\n",
       "      <th>RB</th>\n",
       "      <th>VB</th>\n",
       "      <th>TO</th>\n",
       "      <th>...</th>\n",
       "      <th>VBG</th>\n",
       "      <th>EX</th>\n",
       "      <th>JJR</th>\n",
       "      <th>PDT</th>\n",
       "      <th>RP</th>\n",
       "      <th>WP</th>\n",
       "      <th>CD</th>\n",
       "      <th>RBR</th>\n",
       "      <th>MD</th>\n",
       "      <th>RBS</th>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>PoN</th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "      <th></th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>0.207547</td>\n",
       "      <td>0.056604</td>\n",
       "      <td>0.056604</td>\n",
       "      <td>0.169811</td>\n",
       "      <td>0.056604</td>\n",
       "      <td>0.037736</td>\n",
       "      <td>0.075472</td>\n",
       "      <td>0.075472</td>\n",
       "      <td>0.075472</td>\n",
       "      <td>0.056604</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>0.276190</td>\n",
       "      <td>0.009524</td>\n",
       "      <td>0.009524</td>\n",
       "      <td>0.066667</td>\n",
       "      <td>0.047619</td>\n",
       "      <td>0.009524</td>\n",
       "      <td>0.133333</td>\n",
       "      <td>0.076190</td>\n",
       "      <td>0.038095</td>\n",
       "      <td>0.038095</td>\n",
       "      <td>...</td>\n",
       "      <td>0.009524</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>N</td>\n",
       "      <td>0.288889</td>\n",
       "      <td>0.044444</td>\n",
       "      <td>0.044444</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.022222</td>\n",
       "      <td>0.044444</td>\n",
       "      <td>0.111111</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.022222</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>P</td>\n",
       "      <td>0.162791</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.023256</td>\n",
       "      <td>0.139535</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.069767</td>\n",
       "      <td>0.093023</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.023256</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.069767</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>P</td>\n",
       "      <td>0.208333</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.083333</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>...</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.041667</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>P</td>\n",
       "      <td>0.191919</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.070707</td>\n",
       "      <td>0.070707</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.141414</td>\n",
       "      <td>0.101010</td>\n",
       "      <td>0.070707</td>\n",
       "      <td>0.040404</td>\n",
       "      <td>...</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.010101</td>\n",
       "      <td>0.020202</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>P</td>\n",
       "      <td>0.206452</td>\n",
       "      <td>0.045161</td>\n",
       "      <td>0.019355</td>\n",
       "      <td>0.090323</td>\n",
       "      <td>0.045161</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.096774</td>\n",
       "      <td>0.070968</td>\n",
       "      <td>0.045161</td>\n",
       "      <td>0.045161</td>\n",
       "      <td>...</td>\n",
       "      <td>0.012903</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.025806</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>P</td>\n",
       "      <td>0.232558</td>\n",
       "      <td>0.023256</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.139535</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.093023</td>\n",
       "      <td>0.069767</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>0.046512</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.023256</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 28 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "           NN       NNS       VBP        JJ        CC       VBZ        DT  \\\n",
       "PoN                                                                         \n",
       "N         NaN       NaN       NaN       NaN       NaN       NaN       NaN   \n",
       "N    0.207547  0.056604  0.056604  0.169811  0.056604  0.037736  0.075472   \n",
       "N    0.276190  0.009524  0.009524  0.066667  0.047619  0.009524  0.133333   \n",
       "N         NaN       NaN       NaN       NaN       NaN       NaN       NaN   \n",
       "N    0.288889  0.044444  0.044444  0.111111  0.022222  0.044444  0.111111   \n",
       "..        ...       ...       ...       ...       ...       ...       ...   \n",
       "P    0.162791  0.046512  0.023256  0.139535  0.046512  0.000000  0.046512   \n",
       "P    0.208333  0.041667  0.000000  0.041667  0.000000  0.000000  0.083333   \n",
       "P    0.191919  0.010101  0.000000  0.070707  0.070707  0.010101  0.141414   \n",
       "P    0.206452  0.045161  0.019355  0.090323  0.045161  0.000000  0.096774   \n",
       "P    0.232558  0.023256  0.046512  0.139535  0.046512  0.000000  0.093023   \n",
       "\n",
       "           RB        VB        TO  ...       VBG   EX  JJR  PDT        RP  \\\n",
       "PoN                                ...                                      \n",
       "N         NaN       NaN       NaN  ...       NaN  NaN  NaN  NaN       NaN   \n",
       "N    0.075472  0.075472  0.056604  ...  0.000000  0.0  0.0  0.0  0.000000   \n",
       "N    0.076190  0.038095  0.038095  ...  0.009524  0.0  0.0  0.0  0.000000   \n",
       "N         NaN       NaN       NaN  ...       NaN  NaN  NaN  NaN       NaN   \n",
       "N    0.000000  0.000000  0.022222  ...  0.000000  0.0  0.0  0.0  0.000000   \n",
       "..        ...       ...       ...  ...       ...  ...  ...  ...       ...   \n",
       "P    0.069767  0.093023  0.046512  ...  0.023256  0.0  0.0  0.0  0.000000   \n",
       "P    0.000000  0.041667  0.041667  ...  0.041667  0.0  0.0  0.0  0.000000   \n",
       "P    0.101010  0.070707  0.040404  ...  0.010101  0.0  0.0  0.0  0.000000   \n",
       "P    0.070968  0.045161  0.045161  ...  0.012903  0.0  0.0  0.0  0.025806   \n",
       "P    0.069767  0.046512  0.046512  ...  0.000000  0.0  0.0  0.0  0.023256   \n",
       "\n",
       "      WP        CD  RBR        MD       RBS  \n",
       "PoN                                          \n",
       "N    NaN       NaN  NaN       NaN       NaN  \n",
       "N    0.0  0.000000  0.0  0.000000  0.000000  \n",
       "N    0.0  0.000000  0.0  0.000000  0.000000  \n",
       "N    NaN       NaN  NaN       NaN       NaN  \n",
       "N    0.0  0.000000  0.0  0.000000  0.000000  \n",
       "..   ...       ...  ...       ...       ...  \n",
       "P    0.0  0.000000  0.0  0.069767  0.000000  \n",
       "P    0.0  0.041667  0.0  0.000000  0.000000  \n",
       "P    0.0  0.010101  0.0  0.010101  0.020202  \n",
       "P    0.0  0.000000  0.0  0.000000  0.000000  \n",
       "P    0.0  0.000000  0.0  0.000000  0.000000  \n",
       "\n",
       "[92 rows x 28 columns]"
      ]
     },
     "execution_count": 222,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "norm_df = normalize_df(new_df)\n",
    "\n",
    "# new_df['total'] = new_df.sum(axis = 1)\n",
    "# new_df_norm = new_df.copy()\n",
    "# new_df_norm = new_df_norm.apply(lambda x: x/x['total'], axis=1)\n",
    "\n",
    "# new_df_norm = new_df_norm.drop('total', axis=1)\n",
    "# norm_df = norm_df.fillna(0).astype(int)\n",
    "# get_NB(new_df_norm, new_df_norm.index)\n",
    "# new_df_norm[:5]\n",
    "norm_df\n",
    "# new_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 197,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5\n"
     ]
    }
   ],
   "source": [
    "new_df = pd.DataFrame(all_df['bow_no_sw'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "get_NB(new_df, new_df.index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}