{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW2: VECTORIZATION (Pandas style!)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 1: Import ALL the things\n",
    "### Import libraries  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "##########################################\n",
    "# NOTE: I'm toying with the idea of requiring the library just above \n",
    "# when I use it so it makes more sense in context\n",
    "##########################################\n",
    "# import os\n",
    "# import pandas as pd\n",
    "# from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "# from nltk.sentiment import SentimentAnalyzer\n",
    "# from nltk.sentiment.util import *\n",
    "# from nltk.probability import FreqDist\n",
    "# from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "# sid = SentimentIntensityAnalyzer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Import data from files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "# neg = get_data_from_files('../neg_cornell/')\n",
    "# pos = get_data_from_files('../pos_cornell/')\n",
    "\n",
    "# v1\n",
    "# neg = get_data_from_files('../hw4_lie_false/')\n",
    "# pos = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# Active run: the directory-to-variable mapping is reversed relative to v1 above\n",
    "# (pos <- hw4_lie_false, neg <- hw4_lie_true).\n",
    "pos = get_data_from_files('../hw4_lie_false/')\n",
    "neg = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_hw4/')\n",
    "# pos = get_data_from_files('../pos_hw4/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 2: Prep Data\n",
    "### STEP 2a: Turn that fresh text into a pandas DF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "# One row per file read; the raw text lands in the default column labelled 0.\n",
    "neg_df = pd.DataFrame(neg)\n",
    "pos_df = pd.DataFrame(pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 2b: Label it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Tag every row of each frame with a constant class label in a new 'PoN' column.\n",
    "pos_df['PoN'] = 'P'\n",
    "neg_df['PoN'] = 'N'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 2c: Combine the dfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the\n",
    "# supported equivalent. Each frame keeps its own row labels, so the combined index\n",
    "# contains duplicates (neg rows first, then pos rows).\n",
    "all_df = pd.concat([neg_df, pos_df])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I have been to a Asian restaurant in New York ...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>41</td>\n",
       "      <td>Mikes Pizza High Point NY Service was very slo...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>42</td>\n",
       "      <td>After I went shopping with some of my friend w...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>43</td>\n",
       "      <td>I entered the restaurant and a waitress came b...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>44</td>\n",
       "      <td>Carlos Plate Shack was the worst dining experi...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45</td>\n",
       "      <td>Olive Oil Garden was very disappointing. I exp...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    0 PoN\n",
       "0                                                   ?   N\n",
       "1   Twin Trees Cicero NY HUGE salad bar and high q...   N\n",
       "2   The worst restaurant that I have ever eaten in...   N\n",
       "3                                                   ?   N\n",
       "4   I have been to a Asian restaurant in New York ...   N\n",
       "..                                                ...  ..\n",
       "41  Mikes Pizza High Point NY Service was very slo...   P\n",
       "42  After I went shopping with some of my friend w...   P\n",
       "43  I entered the restaurant and a waitress came b...   P\n",
       "44  Carlos Plate Shack was the worst dining experi...   P\n",
       "45  Olive Oil Garden was very disappointing. I exp...   P\n",
       "\n",
       "[92 rows x 2 columns]"
      ]
     },
     "execution_count": 87,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 3: TOKENIZE (and clean)!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Came back and added sentence tokenization for the \"Summary experiment\"\n",
    "def get_sentence_tokens(review):\n",
    "    \"\"\"Split one review's text into a list of sentence strings with NLTK.\"\"\"\n",
    "    sentence_list = sent_tokenize(review)\n",
    "    return sentence_list\n",
    "    \n",
    "# Column-wise Series.apply yields the same per-row values as the row-wise\n",
    "# DataFrame.apply(axis=1) while avoiding the per-row Series overhead.\n",
    "all_df['sentences'] = all_df[0].apply(get_sentence_tokens)\n",
    "all_df['num_sentences'] = all_df['sentences'].apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 90,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tokens(sentence):\n",
    "    \"\"\"Word-tokenize text and keep only purely alphabetic tokens, lower-cased.\"\"\"\n",
    "    lowered = []\n",
    "    for tok in word_tokenize(sentence):\n",
    "        if tok.isalpha():\n",
    "            lowered.append(tok.lower())\n",
    "    return lowered\n",
    "\n",
    "# Column-wise Series.apply yields the same per-row values as the row-wise\n",
    "# DataFrame.apply(axis=1) while avoiding the per-row Series overhead.\n",
    "all_df['tokens'] = all_df[0].apply(get_tokens)\n",
    "all_df['num_tokens'] = all_df['tokens'].apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I have been to a Asian restaurant in New York ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[I have been to a Asian restaurant in New York...</td>\n",
       "      <td>4</td>\n",
       "      <td>[i, have, been, to, a, asian, restaurant, in, ...</td>\n",
       "      <td>45</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>41</td>\n",
       "      <td>Mikes Pizza High Point NY Service was very slo...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Mikes Pizza High Point NY Service was very sl...</td>\n",
       "      <td>4</td>\n",
       "      <td>[mikes, pizza, high, point, ny, service, was, ...</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>42</td>\n",
       "      <td>After I went shopping with some of my friend w...</td>\n",
       "      <td>P</td>\n",
       "      <td>[After I went shopping with some of my friend ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[after, i, went, shopping, with, some, of, my,...</td>\n",
       "      <td>24</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>43</td>\n",
       "      <td>I entered the restaurant and a waitress came b...</td>\n",
       "      <td>P</td>\n",
       "      <td>[I entered the restaurant and a waitress came ...</td>\n",
       "      <td>5</td>\n",
       "      <td>[i, entered, the, restaurant, and, a, waitress...</td>\n",
       "      <td>99</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>44</td>\n",
       "      <td>Carlos Plate Shack was the worst dining experi...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Carlos Plate Shack was the worst dining exper...</td>\n",
       "      <td>9</td>\n",
       "      <td>[carlos, plate, shack, was, the, worst, dining...</td>\n",
       "      <td>155</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45</td>\n",
       "      <td>Olive Oil Garden was very disappointing. I exp...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Olive Oil Garden was very disappointing., I e...</td>\n",
       "      <td>5</td>\n",
       "      <td>[olive, oil, garden, was, very, disappointing,...</td>\n",
       "      <td>43</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    0 PoN  \\\n",
       "0                                                   ?   N   \n",
       "1   Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2   The worst restaurant that I have ever eaten in...   N   \n",
       "3                                                   ?   N   \n",
       "4   I have been to a Asian restaurant in New York ...   N   \n",
       "..                                                ...  ..   \n",
       "41  Mikes Pizza High Point NY Service was very slo...   P   \n",
       "42  After I went shopping with some of my friend w...   P   \n",
       "43  I entered the restaurant and a waitress came b...   P   \n",
       "44  Carlos Plate Shack was the worst dining experi...   P   \n",
       "45  Olive Oil Garden was very disappointing. I exp...   P   \n",
       "\n",
       "                                            sentences  num_sentences  \\\n",
       "0                                                 [?]              1   \n",
       "1   [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2   [The worst restaurant that I have ever eaten i...              5   \n",
       "3                                                 [?]              1   \n",
       "4   [I have been to a Asian restaurant in New York...              4   \n",
       "..                                                ...            ...   \n",
       "41  [Mikes Pizza High Point NY Service was very sl...              4   \n",
       "42  [After I went shopping with some of my friend ...              2   \n",
       "43  [I entered the restaurant and a waitress came ...              5   \n",
       "44  [Carlos Plate Shack was the worst dining exper...              9   \n",
       "45  [Olive Oil Garden was very disappointing., I e...              5   \n",
       "\n",
       "                                               tokens  num_tokens  \n",
       "0                                                  []           0  \n",
       "1   [twin, trees, cicero, ny, huge, salad, bar, an...          53  \n",
       "2   [the, worst, restaurant, that, i, have, ever, ...         105  \n",
       "3                                                  []           0  \n",
       "4   [i, have, been, to, a, asian, restaurant, in, ...          45  \n",
       "..                                                ...         ...  \n",
       "41  [mikes, pizza, high, point, ny, service, was, ...          43  \n",
       "42  [after, i, went, shopping, with, some, of, my,...          24  \n",
       "43  [i, entered, the, restaurant, and, a, waitress...          99  \n",
       "44  [carlos, plate, shack, was, the, worst, dining...         155  \n",
       "45  [olive, oil, garden, was, very, disappointing,...          43  \n",
       "\n",
       "[92 rows x 6 columns]"
      ]
     },
     "execution_count": 91,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 4: Remove Stopwords"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "# Column-wise Series.apply yields the same per-row values as the row-wise\n",
    "# DataFrame.apply(axis=1) while avoiding the per-row Series overhead.\n",
    "all_df['no_sw'] = all_df['tokens'].apply(remove_stopwords)\n",
    "all_df['num_no_sw'] = all_df['no_sw'].apply(len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 93,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I have been to a Asian restaurant in New York ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[I have been to a Asian restaurant in New York...</td>\n",
       "      <td>4</td>\n",
       "      <td>[i, have, been, to, a, asian, restaurant, in, ...</td>\n",
       "      <td>45</td>\n",
       "      <td>[asian, restaurant, new, york, city, menu, wri...</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>41</td>\n",
       "      <td>Mikes Pizza High Point NY Service was very slo...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Mikes Pizza High Point NY Service was very sl...</td>\n",
       "      <td>4</td>\n",
       "      <td>[mikes, pizza, high, point, ny, service, was, ...</td>\n",
       "      <td>43</td>\n",
       "      <td>[mikes, pizza, high, point, ny, service, slow,...</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>42</td>\n",
       "      <td>After I went shopping with some of my friend w...</td>\n",
       "      <td>P</td>\n",
       "      <td>[After I went shopping with some of my friend ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[after, i, went, shopping, with, some, of, my,...</td>\n",
       "      <td>24</td>\n",
       "      <td>[went, shopping, friend, went, dodo, restauran...</td>\n",
       "      <td>11</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>43</td>\n",
       "      <td>I entered the restaurant and a waitress came b...</td>\n",
       "      <td>P</td>\n",
       "      <td>[I entered the restaurant and a waitress came ...</td>\n",
       "      <td>5</td>\n",
       "      <td>[i, entered, the, restaurant, and, a, waitress...</td>\n",
       "      <td>99</td>\n",
       "      <td>[entered, restaurant, waitress, came, blanking...</td>\n",
       "      <td>49</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>44</td>\n",
       "      <td>Carlos Plate Shack was the worst dining experi...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Carlos Plate Shack was the worst dining exper...</td>\n",
       "      <td>9</td>\n",
       "      <td>[carlos, plate, shack, was, the, worst, dining...</td>\n",
       "      <td>155</td>\n",
       "      <td>[carlos, plate, shack, worst, dining, experien...</td>\n",
       "      <td>88</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45</td>\n",
       "      <td>Olive Oil Garden was very disappointing. I exp...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Olive Oil Garden was very disappointing., I e...</td>\n",
       "      <td>5</td>\n",
       "      <td>[olive, oil, garden, was, very, disappointing,...</td>\n",
       "      <td>43</td>\n",
       "      <td>[olive, oil, garden, disappointing, expect, go...</td>\n",
       "      <td>23</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    0 PoN  \\\n",
       "0                                                   ?   N   \n",
       "1   Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2   The worst restaurant that I have ever eaten in...   N   \n",
       "3                                                   ?   N   \n",
       "4   I have been to a Asian restaurant in New York ...   N   \n",
       "..                                                ...  ..   \n",
       "41  Mikes Pizza High Point NY Service was very slo...   P   \n",
       "42  After I went shopping with some of my friend w...   P   \n",
       "43  I entered the restaurant and a waitress came b...   P   \n",
       "44  Carlos Plate Shack was the worst dining experi...   P   \n",
       "45  Olive Oil Garden was very disappointing. I exp...   P   \n",
       "\n",
       "                                            sentences  num_sentences  \\\n",
       "0                                                 [?]              1   \n",
       "1   [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2   [The worst restaurant that I have ever eaten i...              5   \n",
       "3                                                 [?]              1   \n",
       "4   [I have been to a Asian restaurant in New York...              4   \n",
       "..                                                ...            ...   \n",
       "41  [Mikes Pizza High Point NY Service was very sl...              4   \n",
       "42  [After I went shopping with some of my friend ...              2   \n",
       "43  [I entered the restaurant and a waitress came ...              5   \n",
       "44  [Carlos Plate Shack was the worst dining exper...              9   \n",
       "45  [Olive Oil Garden was very disappointing., I e...              5   \n",
       "\n",
       "                                               tokens  num_tokens  \\\n",
       "0                                                  []           0   \n",
       "1   [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2   [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "3                                                  []           0   \n",
       "4   [i, have, been, to, a, asian, restaurant, in, ...          45   \n",
       "..                                                ...         ...   \n",
       "41  [mikes, pizza, high, point, ny, service, was, ...          43   \n",
       "42  [after, i, went, shopping, with, some, of, my,...          24   \n",
       "43  [i, entered, the, restaurant, and, a, waitress...          99   \n",
       "44  [carlos, plate, shack, was, the, worst, dining...         155   \n",
       "45  [olive, oil, garden, was, very, disappointing,...          43   \n",
       "\n",
       "                                                no_sw  num_no_sw  \n",
       "0                                                  []          0  \n",
       "1   [twin, trees, cicero, ny, huge, salad, bar, hi...         32  \n",
       "2   [worst, restaurant, ever, eaten, undoubtedly, ...         49  \n",
       "3                                                  []          0  \n",
       "4   [asian, restaurant, new, york, city, menu, wri...         23  \n",
       "..                                                ...        ...  \n",
       "41  [mikes, pizza, high, point, ny, service, slow,...         26  \n",
       "42  [went, shopping, friend, went, dodo, restauran...         11  \n",
       "43  [entered, restaurant, waitress, came, blanking...         49  \n",
       "44  [carlos, plate, shack, worst, dining, experien...         88  \n",
       "45  [olive, oil, garden, disappointing, expect, go...         23  \n",
       "\n",
       "[92 rows x 8 columns]"
      ]
     },
     "execution_count": 93,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 5: Create a Frequency Distribution"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 94,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.probability import FreqDist\n",
    "def get_most_common(tokens, n=12):\n",
    "    \"\"\"Return the `n` most frequent tokens as (token, count) pairs.\n",
    "\n",
    "    Args:\n",
    "        tokens: Iterable of hashable tokens to count.\n",
    "        n: How many entries to return; defaults to 12.\n",
    "\n",
    "    Returns:\n",
    "        The list produced by FreqDist(tokens).most_common(n).\n",
    "    \"\"\"\n",
    "    return FreqDist(tokens).most_common(n)\n",
    "# Column-wise Series.apply yields the same per-row values as DataFrame.apply(axis=1).\n",
    "all_df['topwords_unfil'] = all_df['tokens'].apply(get_most_common)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 95,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# get_most_common is already defined in the previous cell; reuse it here instead of\n",
    "# redefining an identical copy.\n",
    "all_df['topwords_fil'] = all_df['no_sw'].apply(get_most_common)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 96,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_fdist(tokens):\n",
    "    \"\"\"Build an NLTK FreqDist from `tokens`.\"\"\"\n",
    "    distribution = FreqDist(tokens)\n",
    "    return distribution\n",
    "    \n",
    "# Column-wise Series.apply yields the same per-row values as the row-wise\n",
    "# DataFrame.apply(axis=1) while avoiding the per-row Series overhead.\n",
    "all_df['freq_dist'] = all_df['no_sw'].apply(get_fdist)\n",
    "all_df['freq_dist_unfil'] = all_df['tokens'].apply(get_fdist)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>freq_dist</th>\n",
       "      <th>freq_dist_unfil</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>Twin Trees Cicero NY HUGE salad bar and high q...</td>\n",
       "      <td>N</td>\n",
       "      <td>[Twin Trees Cicero NY HUGE salad bar and high ...</td>\n",
       "      <td>4</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, an...</td>\n",
       "      <td>53</td>\n",
       "      <td>[twin, trees, cicero, ny, huge, salad, bar, hi...</td>\n",
       "      <td>32</td>\n",
       "      <td>[(and, 3), (to, 3), (are, 2), (the, 2), (twin,...</td>\n",
       "      <td>[(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ...</td>\n",
       "      <td>{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...</td>\n",
       "      <td>{'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>The worst restaurant that I have ever eaten in...</td>\n",
       "      <td>N</td>\n",
       "      <td>[The worst restaurant that I have ever eaten i...</td>\n",
       "      <td>5</td>\n",
       "      <td>[the, worst, restaurant, that, i, have, ever, ...</td>\n",
       "      <td>105</td>\n",
       "      <td>[worst, restaurant, ever, eaten, undoubtedly, ...</td>\n",
       "      <td>49</td>\n",
       "      <td>[(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),...</td>\n",
       "      <td>[(pepper, 3), (veggie, 2), (sandwich, 2), (red...</td>\n",
       "      <td>{'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...</td>\n",
       "      <td>{'the': 6, 'worst': 1, 'restaurant': 1, 'that'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>?</td>\n",
       "      <td>N</td>\n",
       "      <td>[?]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>{}</td>\n",
       "      <td>{}</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I have been to a Asian restaurant in New York ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[I have been to a Asian restaurant in New York...</td>\n",
       "      <td>4</td>\n",
       "      <td>[i, have, been, to, a, asian, restaurant, in, ...</td>\n",
       "      <td>45</td>\n",
       "      <td>[asian, restaurant, new, york, city, menu, wri...</td>\n",
       "      <td>23</td>\n",
       "      <td>[(i, 3), (a, 3), (the, 2), (is, 2), (by, 2), (...</td>\n",
       "      <td>[(asian, 1), (restaurant, 1), (new, 1), (york,...</td>\n",
       "      <td>{'asian': 1, 'restaurant': 1, 'new': 1, 'york'...</td>\n",
       "      <td>{'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>41</td>\n",
       "      <td>Mikes Pizza High Point NY Service was very slo...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Mikes Pizza High Point NY Service was very sl...</td>\n",
       "      <td>4</td>\n",
       "      <td>[mikes, pizza, high, point, ny, service, was, ...</td>\n",
       "      <td>43</td>\n",
       "      <td>[mikes, pizza, high, point, ny, service, slow,...</td>\n",
       "      <td>26</td>\n",
       "      <td>[(pizza, 2), (was, 2), (you, 2), (would, 2), (...</td>\n",
       "      <td>[(pizza, 2), (would, 2), (mikes, 1), (high, 1)...</td>\n",
       "      <td>{'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1...</td>\n",
       "      <td>{'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>42</td>\n",
       "      <td>After I went shopping with some of my friend w...</td>\n",
       "      <td>P</td>\n",
       "      <td>[After I went shopping with some of my friend ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[after, i, went, shopping, with, some, of, my,...</td>\n",
       "      <td>24</td>\n",
       "      <td>[went, shopping, friend, went, dodo, restauran...</td>\n",
       "      <td>11</td>\n",
       "      <td>[(i, 2), (went, 2), (of, 2), (after, 1), (shop...</td>\n",
       "      <td>[(went, 2), (shopping, 1), (friend, 1), (dodo,...</td>\n",
       "      <td>{'went': 2, 'shopping': 1, 'friend': 1, 'dodo'...</td>\n",
       "      <td>{'after': 1, 'i': 2, 'went': 2, 'shopping': 1,...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>43</td>\n",
       "      <td>I entered the restaurant and a waitress came b...</td>\n",
       "      <td>P</td>\n",
       "      <td>[I entered the restaurant and a waitress came ...</td>\n",
       "      <td>5</td>\n",
       "      <td>[i, entered, the, restaurant, and, a, waitress...</td>\n",
       "      <td>99</td>\n",
       "      <td>[entered, restaurant, waitress, came, blanking...</td>\n",
       "      <td>49</td>\n",
       "      <td>[(the, 9), (i, 6), (and, 6), (to, 4), (a, 2), ...</td>\n",
       "      <td>[(waitress, 2), (waited, 2), (even, 2), (food,...</td>\n",
       "      <td>{'entered': 1, 'restaurant': 1, 'waitress': 2,...</td>\n",
       "      <td>{'i': 6, 'entered': 1, 'the': 9, 'restaurant':...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>44</td>\n",
       "      <td>Carlos Plate Shack was the worst dining experi...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Carlos Plate Shack was the worst dining exper...</td>\n",
       "      <td>9</td>\n",
       "      <td>[carlos, plate, shack, was, the, worst, dining...</td>\n",
       "      <td>155</td>\n",
       "      <td>[carlos, plate, shack, worst, dining, experien...</td>\n",
       "      <td>88</td>\n",
       "      <td>[(the, 9), (to, 7), (plate, 6), (and, 5), (my,...</td>\n",
       "      <td>[(plate, 6), (southern, 3), (comfort, 3), (ext...</td>\n",
       "      <td>{'carlos': 1, 'plate': 6, 'shack': 1, 'worst':...</td>\n",
       "      <td>{'carlos': 1, 'plate': 6, 'shack': 1, 'was': 3...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>45</td>\n",
       "      <td>Olive Oil Garden was very disappointing. I exp...</td>\n",
       "      <td>P</td>\n",
       "      <td>[Olive Oil Garden was very disappointing., I e...</td>\n",
       "      <td>5</td>\n",
       "      <td>[olive, oil, garden, was, very, disappointing,...</td>\n",
       "      <td>43</td>\n",
       "      <td>[olive, oil, garden, disappointing, expect, go...</td>\n",
       "      <td>23</td>\n",
       "      <td>[(the, 3), (olive, 2), (oil, 2), (garden, 2), ...</td>\n",
       "      <td>[(olive, 2), (oil, 2), (garden, 2), (good, 2),...</td>\n",
       "      <td>{'olive': 2, 'oil': 2, 'garden': 2, 'disappoin...</td>\n",
       "      <td>{'olive': 2, 'oil': 2, 'garden': 2, 'was': 2, ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>92 rows × 12 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                    0 PoN  \\\n",
       "0                                                   ?   N   \n",
       "1   Twin Trees Cicero NY HUGE salad bar and high q...   N   \n",
       "2   The worst restaurant that I have ever eaten in...   N   \n",
       "3                                                   ?   N   \n",
       "4   I have been to a Asian restaurant in New York ...   N   \n",
       "..                                                ...  ..   \n",
       "41  Mikes Pizza High Point NY Service was very slo...   P   \n",
       "42  After I went shopping with some of my friend w...   P   \n",
       "43  I entered the restaurant and a waitress came b...   P   \n",
       "44  Carlos Plate Shack was the worst dining experi...   P   \n",
       "45  Olive Oil Garden was very disappointing. I exp...   P   \n",
       "\n",
       "                                            sentences  num_sentences  \\\n",
       "0                                                 [?]              1   \n",
       "1   [Twin Trees Cicero NY HUGE salad bar and high ...              4   \n",
       "2   [The worst restaurant that I have ever eaten i...              5   \n",
       "3                                                 [?]              1   \n",
       "4   [I have been to a Asian restaurant in New York...              4   \n",
       "..                                                ...            ...   \n",
       "41  [Mikes Pizza High Point NY Service was very sl...              4   \n",
       "42  [After I went shopping with some of my friend ...              2   \n",
       "43  [I entered the restaurant and a waitress came ...              5   \n",
       "44  [Carlos Plate Shack was the worst dining exper...              9   \n",
       "45  [Olive Oil Garden was very disappointing., I e...              5   \n",
       "\n",
       "                                               tokens  num_tokens  \\\n",
       "0                                                  []           0   \n",
       "1   [twin, trees, cicero, ny, huge, salad, bar, an...          53   \n",
       "2   [the, worst, restaurant, that, i, have, ever, ...         105   \n",
       "3                                                  []           0   \n",
       "4   [i, have, been, to, a, asian, restaurant, in, ...          45   \n",
       "..                                                ...         ...   \n",
       "41  [mikes, pizza, high, point, ny, service, was, ...          43   \n",
       "42  [after, i, went, shopping, with, some, of, my,...          24   \n",
       "43  [i, entered, the, restaurant, and, a, waitress...          99   \n",
       "44  [carlos, plate, shack, was, the, worst, dining...         155   \n",
       "45  [olive, oil, garden, was, very, disappointing,...          43   \n",
       "\n",
       "                                                no_sw  num_no_sw  \\\n",
       "0                                                  []          0   \n",
       "1   [twin, trees, cicero, ny, huge, salad, bar, hi...         32   \n",
       "2   [worst, restaurant, ever, eaten, undoubtedly, ...         49   \n",
       "3                                                  []          0   \n",
       "4   [asian, restaurant, new, york, city, menu, wri...         23   \n",
       "..                                                ...        ...   \n",
       "41  [mikes, pizza, high, point, ny, service, slow,...         26   \n",
       "42  [went, shopping, friend, went, dodo, restauran...         11   \n",
       "43  [entered, restaurant, waitress, came, blanking...         49   \n",
       "44  [carlos, plate, shack, worst, dining, experien...         88   \n",
       "45  [olive, oil, garden, disappointing, expect, go...         23   \n",
       "\n",
       "                                       topwords_unfil  \\\n",
       "0                                                  []   \n",
       "1   [(and, 3), (to, 3), (are, 2), (the, 2), (twin,...   \n",
       "2   [(the, 6), (i, 6), (a, 5), (was, 5), (and, 4),...   \n",
       "3                                                  []   \n",
       "4   [(i, 3), (a, 3), (the, 2), (is, 2), (by, 2), (...   \n",
       "..                                                ...   \n",
       "41  [(pizza, 2), (was, 2), (you, 2), (would, 2), (...   \n",
       "42  [(i, 2), (went, 2), (of, 2), (after, 1), (shop...   \n",
       "43  [(the, 9), (i, 6), (and, 6), (to, 4), (a, 2), ...   \n",
       "44  [(the, 9), (to, 7), (plate, 6), (and, 5), (my,...   \n",
       "45  [(the, 3), (olive, 2), (oil, 2), (garden, 2), ...   \n",
       "\n",
       "                                         topwords_fil  \\\n",
       "0                                                  []   \n",
       "1   [(twin, 1), (trees, 1), (cicero, 1), (ny, 1), ...   \n",
       "2   [(pepper, 3), (veggie, 2), (sandwich, 2), (red...   \n",
       "3                                                  []   \n",
       "4   [(asian, 1), (restaurant, 1), (new, 1), (york,...   \n",
       "..                                                ...   \n",
       "41  [(pizza, 2), (would, 2), (mikes, 1), (high, 1)...   \n",
       "42  [(went, 2), (shopping, 1), (friend, 1), (dodo,...   \n",
       "43  [(waitress, 2), (waited, 2), (even, 2), (food,...   \n",
       "44  [(plate, 6), (southern, 3), (comfort, 3), (ext...   \n",
       "45  [(olive, 2), (oil, 2), (garden, 2), (good, 2),...   \n",
       "\n",
       "                                            freq_dist  \\\n",
       "0                                                  {}   \n",
       "1   {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...   \n",
       "2   {'worst': 1, 'restaurant': 1, 'ever': 1, 'eate...   \n",
       "3                                                  {}   \n",
       "4   {'asian': 1, 'restaurant': 1, 'new': 1, 'york'...   \n",
       "..                                                ...   \n",
       "41  {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1...   \n",
       "42  {'went': 2, 'shopping': 1, 'friend': 1, 'dodo'...   \n",
       "43  {'entered': 1, 'restaurant': 1, 'waitress': 2,...   \n",
       "44  {'carlos': 1, 'plate': 6, 'shack': 1, 'worst':...   \n",
       "45  {'olive': 2, 'oil': 2, 'garden': 2, 'disappoin...   \n",
       "\n",
       "                                      freq_dist_unfil  \n",
       "0                                                  {}  \n",
       "1   {'twin': 1, 'trees': 1, 'cicero': 1, 'ny': 1, ...  \n",
       "2   {'the': 6, 'worst': 1, 'restaurant': 1, 'that'...  \n",
       "3                                                  {}  \n",
       "4   {'i': 3, 'have': 1, 'been': 1, 'to': 1, 'a': 3...  \n",
       "..                                                ...  \n",
       "41  {'mikes': 1, 'pizza': 2, 'high': 1, 'point': 1...  \n",
       "42  {'after': 1, 'i': 2, 'went': 2, 'shopping': 1,...  \n",
       "43  {'i': 6, 'entered': 1, 'the': 9, 'restaurant':...  \n",
       "44  {'carlos': 1, 'plate': 6, 'shack': 1, 'was': 3...  \n",
       "45  {'olive': 2, 'oil': 2, 'garden': 2, 'was': 2, ...  \n",
       "\n",
       "[92 rows x 12 columns]"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 6: Try Different Sentiment Analysis Tools"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### VADER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 98,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "sid = SentimentIntensityAnalyzer()\n",
    "def get_vader_score(review):\n",
    "    return sid.polarity_scores(review)\n",
    "\n",
    "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 99,
   "metadata": {},
   "outputs": [],
   "source": [
    "def separate_vader_score(vader_score, key):\n",
    "    return vader_score[key]\n",
    "\n",
    "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n",
    "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n",
    "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n",
    "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DIY SUMMARY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "17    Halos is home. I have been here numerous times...\n",
       "17    I went to Joeys and had the best lasagna on th...\n",
       "Name: 0, dtype: object"
      ]
     },
     "execution_count": 100,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[0][17]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_weighted_freq_dist(review, freq_dist):\n",
    "    try:\n",
    "        max_freq = max(freq_dist.values())\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/max_freq)\n",
    "        return freq_dist\n",
    "    except:\n",
    "        return 'nope'\n",
    "\n",
    "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_sentence_score(review, freq_dist):\n",
    "    sentence_scores = {}\n",
    "    for sent in review:\n",
    "        for word in nltk.word_tokenize(sent.lower()):\n",
    "            if word in freq_dist.keys():\n",
    "                if len(sent.split(' ')) < 30:\n",
    "                    if sent not in sentence_scores.keys():\n",
    "                        sentence_scores[sent] = freq_dist[word]\n",
    "                    else:\n",
    "                        sentence_scores[sent] += freq_dist[word]\n",
    "    return sentence_scores\n",
    "\n",
    "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_summary_sentences(sentence_scores):\n",
    "    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ''.join(sent[0] for sent in sorted_sentences[:5])\n",
    "\n",
    "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    "summaries = all_df['summary_sentences'].tolist()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "''"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "summaries[3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Doing VADER on the Summary Section"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Doing VADER on the Most Frequent Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_freq_words(freq_dist):\n",
    "    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ' '.join(word[0] for word in sorted_words[:50])\n",
    "\n",
    "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n",
    "\n",
    "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n",
    "all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 7: Test `Step 6` with Machine Learning!!\n",
    "### Naive Bayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "def get_NB(small_df, labels):\n",
    "    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "    gnb = GaussianNB()\n",
    "    gnb.fit(x_train, y_train)\n",
    "    y_pred = gnb.predict(x_test)\n",
    "    from sklearn import metrics\n",
    "    print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 1: Vader Scores (Original)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 2: Vader Scores (from Summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6071428571428571\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 3: Vader Scores (original) AND Vader Scores (summary)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5714285714285714\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n",
    "                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 4: Vader Scores (50 most frequent -- filtered -- words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6428571428571429\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 5: All `compound` Vader Scores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6071428571428571\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 6: ALL THE NUMBERS!!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 115,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6071428571428571\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n",
    "                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n",
    "                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### TEST 7: Test UNFILTERED most frequent words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 116,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_freq_words(freq_dist):\n",
    "    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ' '.join(word[0] for word in sorted_words[:50])\n",
    "\n",
    "all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)\n",
    "\n",
    "all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)\n",
    "\n",
    "all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)\n",
    "all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)\n",
    "all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)\n",
    "all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6071428571428571\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n",
    "                          'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n",
    "                          'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',\n",
    "                          'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 118,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5357142857142857\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603\n",
    "get_NB(small_df, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "summaries_pos = all_df[all_df['PoN'] == 'P']\n",
    "summaries_neg = all_df[all_df['PoN'] == 'N']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "summaries_pos_list = summaries_pos['summary_sentences'].tolist()\n",
    "summaries_neg_list = summaries_neg['summary_sentences'].tolist()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 8: Test NLTK: Naive Bayes from HW1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.classify import NaiveBayesClassifier\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *\n",
    "\n",
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "def get_nltk_train_test(array, label, num_train):\n",
    "    tokens = [get_tokens(sentence) for sentence in array]\n",
    "    docs = [(sent, label) for sent in tokens]\n",
    "    train_docs = docs[:num_train]\n",
    "    test_docs = docs[num_train:len(array)]\n",
    "    return [train_docs, test_docs]\n",
    "\n",
    "\n",
    "def get_nltk_NB(NEG_DATA, POS_DATA, num_train):\n",
    "    train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)\n",
    "    train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)\n",
    "\n",
    "    training_docs = train_neg + train_pos\n",
    "    testing_docs = test_neg + test_pos\n",
    "\n",
    "    sentim_analyzer = SentimentAnalyzer()\n",
    "    all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n",
    "    unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n",
    "    sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n",
    "    training_set = sentim_analyzer.apply_features(training_docs)\n",
    "    test_set = sentim_analyzer.apply_features(testing_docs)\n",
    "\n",
    "    trainer = NaiveBayesClassifier.train\n",
    "    classifier = sentim_analyzer.train(trainer, training_set)\n",
    "    \n",
    "    results = []\n",
    "    for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):\n",
    "        print('{0}: {1}'.format(key,value))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 136,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 0.5714285714285714\n",
      "F-measure [neg]: 0.5714285714285714\n",
      "F-measure [pos]: 0.5714285714285714\n",
      "Precision [neg]: 0.5714285714285714\n",
      "Precision [pos]: 0.5714285714285714\n",
      "Recall [neg]: 0.5714285714285714\n",
      "Recall [pos]: 0.5714285714285714\n"
     ]
    }
   ],
   "source": [
    "neg_df = all_df[all_df['PoN'] == 'N']\n",
    "neg_df_list = neg_df[0].tolist()\n",
    "\n",
    "pos_df = all_df[all_df['PoN'] == 'P']\n",
    "pos_df_list = pos_df[0].tolist()\n",
    "\n",
    "import math\n",
    "\n",
    "percent_train = 0.7 if (len(pos_df) < 200) else 0.8\n",
    "train_size = math.floor(len(pos_df)*percent_train)\n",
    "get_nltk_NB(neg_df_list, pos_df_list, train_size)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
