{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Sentiment Analysis\n",
    "## TextBlob + Vader + NLTK + Naive Bayes\n",
    "via [this tutorial](https://levelup.gitconnected.com/sentiment-analysis-using-machine-learning-python-9122e03f8f7b) |10-6-19"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "from textblob import TextBlob\n",
    "from IPython.display import display, HTML\n",
    "import os\n",
    "import pandas as pd\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "sid = SentimentIntensityAnalyzer()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "neg_k = get_data_from_files('AI_NEG/')\n",
    "pos_k = get_data_from_files('AI_POS/')\n",
    "neg_a = get_data_from_files('NEG/')\n",
    "pos_a = get_data_from_files('POS/')\n",
    "neg_cornell = get_data_from_files('neg_cornell/')\n",
    "pos_cornell = get_data_from_files('pos_cornell/')\n",
    "neg_dirty = get_data_from_files('NEG_dirty/')\n",
    "pos_dirty = get_data_from_files('POS_dirty/')\n",
    "neg_joker = get_data_from_files('NEG_JK/')\n",
    "pos_joker = get_data_from_files('POS_JK/')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TEXT BLOB"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pn(num):\n",
    "    return 'neg' if num < 0 else 'pos'\n",
    "\n",
    "def get_sentiment(array, label):\n",
    "    blobs = [[TextBlob(text), text] for text in array]\n",
    "    return ([{'label': label,\n",
    "              'prediction': get_pn(obj.sentiment.polarity),\n",
    "              'sentiment': obj.sentiment.polarity,\n",
    "              'length': len(text), \n",
    "              'excerpt': text[:50]} for obj,text in blobs])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 1: Kendra's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.157143</td>\n",
       "      <td>76</td>\n",
       "      <td>WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.750000</td>\n",
       "      <td>96</td>\n",
       "      <td>How can we trust Artificial Intelligence to dr...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.775000</td>\n",
       "      <td>31</td>\n",
       "      <td>I hate artificial intelligence!</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.750000</td>\n",
       "      <td>47</td>\n",
       "      <td>My dog is terrified by artificial intelligence!</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.750000</td>\n",
       "      <td>68</td>\n",
       "      <td>Artificial intelligence is going to melt the b...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  sentiment  length  \\\n",
       "0   neg        neg  -0.157143      76   \n",
       "1   neg        neg  -0.750000      96   \n",
       "2   neg        neg  -0.775000      31   \n",
       "3   neg        neg  -0.750000      47   \n",
       "4   neg        neg  -0.750000      68   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...      yes  \n",
       "1  How can we trust Artificial Intelligence to dr...      yes  \n",
       "2                    I hate artificial intelligence!      yes  \n",
       "3    My dog is terrified by artificial intelligence!      yes  \n",
       "4  Artificial intelligence is going to melt the b...      yes  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.112500</td>\n",
       "      <td>65</td>\n",
       "      <td>My dog is excited by the advancements in artif...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.075000</td>\n",
       "      <td>133</td>\n",
       "      <td>I'm excited for my child to grow up and have t...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.125000</td>\n",
       "      <td>31</td>\n",
       "      <td>I love artificial intelligence!</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.300000</td>\n",
       "      <td>121</td>\n",
       "      <td>Order my groceries, pay my taxes, take my kids...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.133333</td>\n",
       "      <td>116</td>\n",
       "      <td>I'm grateful every day that my child will like...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  sentiment  length  \\\n",
       "0   pos        neg  -0.112500      65   \n",
       "1   pos        neg  -0.075000     133   \n",
       "2   pos        neg  -0.125000      31   \n",
       "3   pos        neg  -0.300000     121   \n",
       "4   pos        neg  -0.133333     116   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  My dog is excited by the advancements in artif...       no  \n",
       "1  I'm excited for my child to grow up and have t...       no  \n",
       "2                    I love artificial intelligence!       no  \n",
       "3  Order my groceries, pay my taxes, take my kids...       no  \n",
       "4  I'm grateful every day that my child will like...       no  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 5\n",
      "CORRECT PREDICT POS: 0\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_sentiment(neg_k, 'neg'))\n",
    "df_p = pd.DataFrame(get_sentiment(pos_k, 'pos'))\n",
    "\n",
    "import numpy as np\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 2: Ami's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.054577</td>\n",
       "      <td>3554</td>\n",
       "      <td>that's exactly how long the movie felt to me ....</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.025467</td>\n",
       "      <td>2929</td>\n",
       "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.003334</td>\n",
       "      <td>3365</td>\n",
       "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.022925</td>\n",
       "      <td>4418</td>\n",
       "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.043234</td>\n",
       "      <td>3911</td>\n",
       "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  sentiment  length  \\\n",
       "0   neg        neg  -0.054577    3554   \n",
       "1   neg        pos   0.025467    2929   \n",
       "2   neg        pos   0.003334    3365   \n",
       "3   neg        pos   0.022925    4418   \n",
       "4   neg        pos   0.043234    3911   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  that's exactly how long the movie felt to me ....      yes  \n",
       "1   \" quest for camelot \" is warner bros . ' firs...       no  \n",
       "2  so ask yourself what \" 8mm \" ( \" eight millime...       no  \n",
       "3  synopsis : a mentally unstable man undergoing ...       no  \n",
       "4  capsule : in 2176 on the planet mars police ta...       no  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.023663</td>\n",
       "      <td>4227</td>\n",
       "      <td>films adapted from comic books have had plenty...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.131092</td>\n",
       "      <td>2421</td>\n",
       "      <td>you've got mail works alot better than it dese...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.110626</td>\n",
       "      <td>6092</td>\n",
       "      <td>\" jaws \" is a rare film that grabs your atten...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.103847</td>\n",
       "      <td>4096</td>\n",
       "      <td>every now and then a movie comes along from a ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.070151</td>\n",
       "      <td>3898</td>\n",
       "      <td>moviemaking is a lot like being the general ma...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  sentiment  length  \\\n",
       "0   pos        pos   0.023663    4227   \n",
       "1   pos        pos   0.131092    2421   \n",
       "2   pos        pos   0.110626    6092   \n",
       "3   pos        pos   0.103847    4096   \n",
       "4   pos        neg  -0.070151    3898   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  films adapted from comic books have had plenty...      yes  \n",
       "1  you've got mail works alot better than it dese...      yes  \n",
       "2   \" jaws \" is a rare film that grabs your atten...      yes  \n",
       "3  every now and then a movie comes along from a ...      yes  \n",
       "4  moviemaking is a lot like being the general ma...       no  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 1\n",
      "CORRECT PREDICT POS: 4\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_sentiment(neg_a, 'neg'))\n",
    "df_p = pd.DataFrame(get_sentiment(pos_a, 'pos'))\n",
    "\n",
    "import numpy as np\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 3: Cornell Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.026240</td>\n",
       "      <td>5953</td>\n",
       "      <td>bad . bad . \\nbad . \\nthat one word seems to p...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.076040</td>\n",
       "      <td>3396</td>\n",
       "      <td>isn't it the ultimate sign of a movie's cinema...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.128733</td>\n",
       "      <td>2762</td>\n",
       "      <td>\" gordy \" is not a movie , it is a 90-minute-...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.000485</td>\n",
       "      <td>3840</td>\n",
       "      <td>disconnect the phone line . \\ndon't accept the...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.122770</td>\n",
       "      <td>2270</td>\n",
       "      <td>when robert forster found himself famous again...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.145489</td>\n",
       "      <td>1945</td>\n",
       "      <td>synopsis : when a meteorite crashlands in the ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.102723</td>\n",
       "      <td>3116</td>\n",
       "      <td>it's now the anniversary of the slayings of ju...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.042473</td>\n",
       "      <td>1755</td>\n",
       "      <td>coinciding with the emerging popularity of mov...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.048656</td>\n",
       "      <td>2826</td>\n",
       "      <td>and now the high-flying hong kong style of fil...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.090655</td>\n",
       "      <td>4165</td>\n",
       "      <td>battlefield long , boring and just plain stupi...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     neg        pos   0.026240    5953   \n",
       "1     neg        pos   0.076040    3396   \n",
       "2     neg        neg  -0.128733    2762   \n",
       "3     neg        neg  -0.000485    3840   \n",
       "4     neg        pos   0.122770    2270   \n",
       "..    ...        ...        ...     ...   \n",
       "995   neg        pos   0.145489    1945   \n",
       "996   neg        pos   0.102723    3116   \n",
       "997   neg        pos   0.042473    1755   \n",
       "998   neg        neg  -0.048656    2826   \n",
       "999   neg        neg  -0.090655    4165   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    bad . bad . \\nbad . \\nthat one word seems to p...       no  \n",
       "1    isn't it the ultimate sign of a movie's cinema...       no  \n",
       "2     \" gordy \" is not a movie , it is a 90-minute-...      yes  \n",
       "3    disconnect the phone line . \\ndon't accept the...      yes  \n",
       "4    when robert forster found himself famous again...       no  \n",
       "..                                                 ...      ...  \n",
       "995  synopsis : when a meteorite crashlands in the ...       no  \n",
       "996  it's now the anniversary of the slayings of ju...       no  \n",
       "997  coinciding with the emerging popularity of mov...       no  \n",
       "998  and now the high-flying hong kong style of fil...      yes  \n",
       "999  battlefield long , boring and just plain stupi...      yes  \n",
       "\n",
       "[1000 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.221173</td>\n",
       "      <td>4662</td>\n",
       "      <td>assume nothing . \\nthe phrase is perhaps one o...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.089736</td>\n",
       "      <td>3839</td>\n",
       "      <td>plot : derek zoolander is a male model . \\nhe ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.206743</td>\n",
       "      <td>9380</td>\n",
       "      <td>i actually am a fan of the original 1961 or so...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.141905</td>\n",
       "      <td>2407</td>\n",
       "      <td>a movie that's been as highly built up as the ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.176332</td>\n",
       "      <td>1840</td>\n",
       "      <td>\" good will hunting \" is two movies in one : ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.072815</td>\n",
       "      <td>2658</td>\n",
       "      <td>one of the funniest carry on movies and the th...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.102879</td>\n",
       "      <td>4196</td>\n",
       "      <td>i remember making a pact , right after `patch ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.195097</td>\n",
       "      <td>2094</td>\n",
       "      <td>barely scrapping by playing at a nyc piano bar...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.117530</td>\n",
       "      <td>4575</td>\n",
       "      <td>if the current trends of hollywood filmmaking ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.013569</td>\n",
       "      <td>3870</td>\n",
       "      <td>capsule : the director of cure brings a weird ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     pos        pos   0.221173    4662   \n",
       "1     pos        pos   0.089736    3839   \n",
       "2     pos        pos   0.206743    9380   \n",
       "3     pos        pos   0.141905    2407   \n",
       "4     pos        pos   0.176332    1840   \n",
       "..    ...        ...        ...     ...   \n",
       "995   pos        pos   0.072815    2658   \n",
       "996   pos        pos   0.102879    4196   \n",
       "997   pos        pos   0.195097    2094   \n",
       "998   pos        pos   0.117530    4575   \n",
       "999   pos        neg  -0.013569    3870   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    assume nothing . \\nthe phrase is perhaps one o...      yes  \n",
       "1    plot : derek zoolander is a male model . \\nhe ...      yes  \n",
       "2    i actually am a fan of the original 1961 or so...      yes  \n",
       "3    a movie that's been as highly built up as the ...      yes  \n",
       "4     \" good will hunting \" is two movies in one : ...      yes  \n",
       "..                                                 ...      ...  \n",
       "995  one of the funniest carry on movies and the th...      yes  \n",
       "996  i remember making a pact , right after `patch ...      yes  \n",
       "997  barely scrapping by playing at a nyc piano bar...      yes  \n",
       "998  if the current trends of hollywood filmmaking ...      yes  \n",
       "999  capsule : the director of cure brings a weird ...       no  \n",
       "\n",
       "[1000 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 229\n",
      "CORRECT PREDICT POS: 971\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))\n",
    "df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))\n",
    "\n",
    "import numpy as np\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 4: Dirty Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.004665</td>\n",
       "      <td>3777</td>\n",
       "      <td>by starring in amy heckerlings  clueless  two ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.119184</td>\n",
       "      <td>3639</td>\n",
       "      <td>i have little against remakes and updates of o...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.100886</td>\n",
       "      <td>4247</td>\n",
       "      <td>i cant recall a previous film experience where...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.097526</td>\n",
       "      <td>4308</td>\n",
       "      <td>the tagline for this film is :  some houses ar...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.048745</td>\n",
       "      <td>5175</td>\n",
       "      <td>warner brothers ; rated pg-13 ( mild violence ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.014624</td>\n",
       "      <td>4086</td>\n",
       "      <td>`the bachelor is one of the best terrible movi...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.035911</td>\n",
       "      <td>3741</td>\n",
       "      <td>as a hot-shot defense attorney , kevin lomax (...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.101395</td>\n",
       "      <td>2890</td>\n",
       "      <td>violence is bad . violence is ugly . violence ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.088523</td>\n",
       "      <td>4089</td>\n",
       "      <td>even though i have the utmost respect for rich...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.074695</td>\n",
       "      <td>2433</td>\n",
       "      <td>an attempt at florida film noir , palmetto fai...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     neg        neg  -0.004665    3777   \n",
       "1     neg        pos   0.119184    3639   \n",
       "2     neg        pos   0.100886    4247   \n",
       "3     neg        pos   0.097526    4308   \n",
       "4     neg        pos   0.048745    5175   \n",
       "..    ...        ...        ...     ...   \n",
       "995   neg        pos   0.014624    4086   \n",
       "996   neg        pos   0.035911    3741   \n",
       "997   neg        pos   0.101395    2890   \n",
       "998   neg        pos   0.088523    4089   \n",
       "999   neg        pos   0.074695    2433   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    by starring in amy heckerlings  clueless  two ...      yes  \n",
       "1    i have little against remakes and updates of o...       no  \n",
       "2    i cant recall a previous film experience where...       no  \n",
       "3    the tagline for this film is :  some houses ar...       no  \n",
       "4    warner brothers ; rated pg-13 ( mild violence ...       no  \n",
       "..                                                 ...      ...  \n",
       "995  `the bachelor is one of the best terrible movi...       no  \n",
       "996  as a hot-shot defense attorney , kevin lomax (...       no  \n",
       "997  violence is bad . violence is ugly . violence ...       no  \n",
       "998  even though i have the utmost respect for rich...       no  \n",
       "999  an attempt at florida film noir , palmetto fai...       no  \n",
       "\n",
       "[1000 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.134641</td>\n",
       "      <td>4584</td>\n",
       "      <td>for the first reel of girls town , you just ca...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.137134</td>\n",
       "      <td>3102</td>\n",
       "      <td>field of dreams almost defies description . al...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.181355</td>\n",
       "      <td>3521</td>\n",
       "      <td>meet joe black is your classic boy-meets-girl ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.104101</td>\n",
       "      <td>2192</td>\n",
       "      <td>an indian runner was more than a courier . he ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.204967</td>\n",
       "      <td>4955</td>\n",
       "      <td>every once in a while , when an exceptional fa...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.118713</td>\n",
       "      <td>4929</td>\n",
       "      <td>the laserman : somehow the title of writer-dir...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.150425</td>\n",
       "      <td>4264</td>\n",
       "      <td>i know what you did last summer ,  the first...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.121243</td>\n",
       "      <td>2374</td>\n",
       "      <td>buffalo ? 66 is a very rarely known movie that...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.130603</td>\n",
       "      <td>2508</td>\n",
       "      <td>time bandits , from director terry gilliam , i...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.011179</td>\n",
       "      <td>5355</td>\n",
       "      <td>warren beattys  bulworth  is a caustic politic...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     pos        pos   0.134641    4584   \n",
       "1     pos        pos   0.137134    3102   \n",
       "2     pos        pos   0.181355    3521   \n",
       "3     pos        pos   0.104101    2192   \n",
       "4     pos        pos   0.204967    4955   \n",
       "..    ...        ...        ...     ...   \n",
       "995   pos        pos   0.118713    4929   \n",
       "996   pos        pos   0.150425    4264   \n",
       "997   pos        pos   0.121243    2374   \n",
       "998   pos        pos   0.130603    2508   \n",
       "999   pos        pos   0.011179    5355   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    for the first reel of girls town , you just ca...      yes  \n",
       "1    field of dreams almost defies description . al...      yes  \n",
       "2    meet joe black is your classic boy-meets-girl ...      yes  \n",
       "3    an indian runner was more than a courier . he ...      yes  \n",
       "4    every once in a while , when an exceptional fa...      yes  \n",
       "..                                                 ...      ...  \n",
       "995  the laserman : somehow the title of writer-dir...      yes  \n",
       "996    i know what you did last summer ,  the first...      yes  \n",
       "997  buffalo ? 66 is a very rarely known movie that...      yes  \n",
       "998  time bandits , from director terry gilliam , i...      yes  \n",
       "999  warren beattys  bulworth  is a caustic politic...      yes  \n",
       "\n",
       "[1000 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 227\n",
      "CORRECT PREDICT POS: 972\n"
     ]
    }
   ],
   "source": [
    "# Score the reviews in neg_dirty with get_sentiment (TextBlob-based) and add an\n",
    "# 'accurate' column: 'yes' where the predicted polarity equals the label, else 'no'.\n",
    "df_n = (\n",
    "    pd.DataFrame(get_sentiment(neg_dirty, 'neg'))\n",
    "    .assign(accurate=lambda scored: np.where(scored['prediction'] == scored['label'], 'yes', 'no'))\n",
    ")\n",
    "# Same scoring and accuracy flag for the reviews in pos_dirty.\n",
    "df_p = (\n",
    "    pd.DataFrame(get_sentiment(pos_dirty, 'pos'))\n",
    "    .assign(accurate=lambda scored: np.where(scored['prediction'] == scored['label'], 'yes', 'no'))\n",
    ")\n",
    "\n",
    "display(df_n, df_p)\n",
    "\n",
    "# Number of rows per class whose prediction matched the label.\n",
    "print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())\n",
    "print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 5: Joker Review Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.152083</td>\n",
       "      <td>1734</td>\n",
       "      <td>Missed Opportunity\\nI had been very excited t...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.001852</td>\n",
       "      <td>328</td>\n",
       "      <td>5/5 for Phoenix's acting..\\nI don't think the...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>145</td>\n",
       "      <td>Everyone praised an overrated movie.\\nOverrat...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.038095</td>\n",
       "      <td>350</td>\n",
       "      <td>What idiotic FIlm\\nI can say that Phoenix is ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.126398</td>\n",
       "      <td>711</td>\n",
       "      <td>Terrible\\nThe only thing good about this movi...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>118</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.290909</td>\n",
       "      <td>432</td>\n",
       "      <td>Boring and disappointing 😣\\nGreat job acting ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>119</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.164710</td>\n",
       "      <td>853</td>\n",
       "      <td>A masterclass in acting nothing more\\nI don't...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>120</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.126667</td>\n",
       "      <td>242</td>\n",
       "      <td>Not equal to the sum of its parts.\\nDespite a...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>121</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.187500</td>\n",
       "      <td>128</td>\n",
       "      <td>Not real Joker\\nThis movie is poorly done as ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>122</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.057436</td>\n",
       "      <td>1212</td>\n",
       "      <td>HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>123 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     neg        pos   0.152083    1734   \n",
       "1     neg        neg  -0.001852     328   \n",
       "2     neg        pos   0.200000     145   \n",
       "3     neg        neg  -0.038095     350   \n",
       "4     neg        pos   0.126398     711   \n",
       "..    ...        ...        ...     ...   \n",
       "118   neg        neg  -0.290909     432   \n",
       "119   neg        pos   0.164710     853   \n",
       "120   neg        pos   0.126667     242   \n",
       "121   neg        neg  -0.187500     128   \n",
       "122   neg        neg  -0.057436    1212   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0     Missed Opportunity\\nI had been very excited t...       no  \n",
       "1     5/5 for Phoenix's acting..\\nI don't think the...      yes  \n",
       "2     Everyone praised an overrated movie.\\nOverrat...       no  \n",
       "3     What idiotic FIlm\\nI can say that Phoenix is ...      yes  \n",
       "4     Terrible\\nThe only thing good about this movi...       no  \n",
       "..                                                 ...      ...  \n",
       "118   Boring and disappointing 😣\\nGreat job acting ...      yes  \n",
       "119   A masterclass in acting nothing more\\nI don't...       no  \n",
       "120   Not equal to the sum of its parts.\\nDespite a...       no  \n",
       "121   Not real Joker\\nThis movie is poorly done as ...      yes  \n",
       "122   HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...      yes  \n",
       "\n",
       "[123 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.107162</td>\n",
       "      <td>5554</td>\n",
       "      <td>funny like a clown\\nGreetings again from the ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.014881</td>\n",
       "      <td>473</td>\n",
       "      <td>Only certain people can relate\\nThis is a mov...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.008294</td>\n",
       "      <td>2509</td>\n",
       "      <td>\"That's Life.\"\\nIn an era of cinema so satura...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.036939</td>\n",
       "      <td>4022</td>\n",
       "      <td>Best DC movie since The Dark Knight Rises\\nDC...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.017162</td>\n",
       "      <td>1430</td>\n",
       "      <td>unbelievable, unrelatable, a bit boring to be...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>118</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.065000</td>\n",
       "      <td>353</td>\n",
       "      <td>Nerve-wracking, but in very uncomfortable way...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>119</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.035557</td>\n",
       "      <td>3501</td>\n",
       "      <td>Solid film but there are glaring problems\\nOk...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>120</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.250203</td>\n",
       "      <td>510</td>\n",
       "      <td>Joker &gt; Endgame\\nNeed I say more? Everything ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>121</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.003030</td>\n",
       "      <td>424</td>\n",
       "      <td>Absolutely not a 10\\nStrong fanboy and hype r...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>122</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.117628</td>\n",
       "      <td>363</td>\n",
       "      <td>Overhyped, but it's alright\\nIt's a good film...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>123 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     pos        pos   0.107162    5554   \n",
       "1     pos        pos   0.014881     473   \n",
       "2     pos        pos   0.008294    2509   \n",
       "3     pos        pos   0.036939    4022   \n",
       "4     pos        neg  -0.017162    1430   \n",
       "..    ...        ...        ...     ...   \n",
       "118   pos        pos   0.065000     353   \n",
       "119   pos        pos   0.035557    3501   \n",
       "120   pos        pos   0.250203     510   \n",
       "121   pos        pos   0.003030     424   \n",
       "122   pos        pos   0.117628     363   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0     funny like a clown\\nGreetings again from the ...      yes  \n",
       "1     Only certain people can relate\\nThis is a mov...      yes  \n",
       "2     \"That's Life.\"\\nIn an era of cinema so satura...      yes  \n",
       "3     Best DC movie since The Dark Knight Rises\\nDC...      yes  \n",
       "4     unbelievable, unrelatable, a bit boring to be...       no  \n",
       "..                                                 ...      ...  \n",
       "118   Nerve-wracking, but in very uncomfortable way...      yes  \n",
       "119   Solid film but there are glaring problems\\nOk...      yes  \n",
       "120   Joker > Endgame\\nNeed I say more? Everything ...      yes  \n",
       "121   Absolutely not a 10\\nStrong fanboy and hype r...      yes  \n",
       "122   Overhyped, but it's alright\\nIt's a good film...      yes  \n",
       "\n",
       "[123 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 64\n",
      "CORRECT PREDICT POS: 114\n"
     ]
    }
   ],
   "source": [
    "# Score the reviews in neg_joker with get_sentiment (TextBlob-based) and add an\n",
    "# 'accurate' column: 'yes' where the predicted polarity equals the label, else 'no'.\n",
    "df_n = (\n",
    "    pd.DataFrame(get_sentiment(neg_joker, 'neg'))\n",
    "    .assign(accurate=lambda scored: np.where(scored['prediction'] == scored['label'], 'yes', 'no'))\n",
    ")\n",
    "# Same scoring and accuracy flag for the reviews in pos_joker.\n",
    "df_p = (\n",
    "    pd.DataFrame(get_sentiment(pos_joker, 'pos'))\n",
    "    .assign(accurate=lambda scored: np.where(scored['prediction'] == scored['label'], 'yes', 'no'))\n",
    ")\n",
    "\n",
    "display(df_n, df_p)\n",
    "\n",
    "# Number of rows per class whose prediction matched the label.\n",
    "print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())\n",
    "print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# VADER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_pn(num):\n",
    "    \"\"\"Turn a numeric score into a polarity label.\n",
    "\n",
    "    Returns 'neg' for values below zero and 'pos' for everything else\n",
    "    (a score of exactly 0 is therefore labelled 'pos').\n",
    "    \"\"\"\n",
    "    if num < 0:\n",
    "        return 'neg'\n",
    "    return 'pos'\n",
    "\n",
    "def get_vader_scores(array, label):\n",
    "    \"\"\"Score every text in `array` with the module-level VADER analyzer `sid`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    array : iterable of str\n",
    "        Texts to score.\n",
    "    label : str\n",
    "        Known polarity of these texts; copied unchanged into every row.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list of dict\n",
    "        One dict per text, in input order, with keys 'label', 'prediction'\n",
    "        (get_pn of the compound score), 'compound' and 'excerpt' (the first\n",
    "        50 characters of the text).\n",
    "    \"\"\"\n",
    "    def score_row(text):\n",
    "        # Only the 'compound' entry of the VADER result is used.\n",
    "        compound = sid.polarity_scores(text)['compound']\n",
    "        return {'label': label,\n",
    "                'prediction': get_pn(compound),\n",
    "                'compound': compound,\n",
    "                'excerpt': text[:50]}\n",
    "\n",
    "    return [score_row(text) for text in array]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[\"WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICIAL INTELLIGENCE TOOK OUR JOBS.\",\n",
       " \"How can we trust Artificial Intelligence to drive our cars when they can't even hack a captcha?!\",\n",
       " 'I hate artificial intelligence!',\n",
       " 'My dog is terrified by artificial intelligence!',\n",
       " 'Artificial intelligence is going to melt the brains of our children!']"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the raw negative sample texts that were loaded from 'AI_NEG/'.\n",
    "neg_k"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 1: Kendra's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.5255</td>\n",
       "      <td>WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.7712</td>\n",
       "      <td>How can we trust Artificial Intelligence to dr...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.2244</td>\n",
       "      <td>I hate artificial intelligence!</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.2942</td>\n",
       "      <td>My dog is terrified by artificial intelligence!</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.5255</td>\n",
       "      <td>Artificial intelligence is going to melt the b...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  compound  \\\n",
       "0   neg        pos    0.5255   \n",
       "1   neg        pos    0.7712   \n",
       "2   neg        neg   -0.2244   \n",
       "3   neg        neg   -0.2942   \n",
       "4   neg        pos    0.5255   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...       no  \n",
       "1  How can we trust Artificial Intelligence to dr...       no  \n",
       "2                    I hate artificial intelligence!      yes  \n",
       "3    My dog is terrified by artificial intelligence!      yes  \n",
       "4  Artificial intelligence is going to melt the b...       no  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.6705</td>\n",
       "      <td>My dog is excited by the advancements in artif...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8271</td>\n",
       "      <td>I'm excited for my child to grow up and have t...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8221</td>\n",
       "      <td>I love artificial intelligence!</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8213</td>\n",
       "      <td>Order my groceries, pay my taxes, take my kids...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8402</td>\n",
       "      <td>I'm grateful every day that my child will like...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  compound  \\\n",
       "0   pos        pos    0.6705   \n",
       "1   pos        pos    0.8271   \n",
       "2   pos        pos    0.8221   \n",
       "3   pos        pos    0.8213   \n",
       "4   pos        pos    0.8402   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  My dog is excited by the advancements in artif...      yes  \n",
       "1  I'm excited for my child to grow up and have t...      yes  \n",
       "2                    I love artificial intelligence!      yes  \n",
       "3  Order my groceries, pay my taxes, take my kids...      yes  \n",
       "4  I'm grateful every day that my child will like...      yes  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 2\n",
      "CORRECT PREDICT POS: 5\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_vader_scores(neg_k, 'neg'))\n",
    "df_p = pd.DataFrame(get_vader_scores(pos_k, 'pos'))\n",
    "\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 2: Ami's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.7836</td>\n",
       "      <td>that's exactly how long the movie felt to me ....</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.8481</td>\n",
       "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9753</td>\n",
       "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.6824</td>\n",
       "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9879</td>\n",
       "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  compound  \\\n",
       "0   neg        pos    0.7836   \n",
       "1   neg        neg   -0.8481   \n",
       "2   neg        neg   -0.9753   \n",
       "3   neg        pos    0.6824   \n",
       "4   neg        neg   -0.9879   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  that's exactly how long the movie felt to me ....       no  \n",
       "1   \" quest for camelot \" is warner bros . ' firs...      yes  \n",
       "2  so ask yourself what \" 8mm \" ( \" eight millime...      yes  \n",
       "3  synopsis : a mentally unstable man undergoing ...       no  \n",
       "4  capsule : in 2176 on the planet mars police ta...      yes  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.5887</td>\n",
       "      <td>films adapted from comic books have had plenty...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9964</td>\n",
       "      <td>you've got mail works alot better than it dese...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9868</td>\n",
       "      <td>\" jaws \" is a rare film that grabs your atten...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8825</td>\n",
       "      <td>every now and then a movie comes along from a ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.3525</td>\n",
       "      <td>moviemaking is a lot like being the general ma...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label prediction  compound  \\\n",
       "0   pos        neg   -0.5887   \n",
       "1   pos        pos    0.9964   \n",
       "2   pos        pos    0.9868   \n",
       "3   pos        pos    0.8825   \n",
       "4   pos        neg   -0.3525   \n",
       "\n",
       "                                             excerpt accurate  \n",
       "0  films adapted from comic books have had plenty...       no  \n",
       "1  you've got mail works alot better than it dese...      yes  \n",
       "2   \" jaws \" is a rare film that grabs your atten...      yes  \n",
       "3  every now and then a movie comes along from a ...      yes  \n",
       "4  moviemaking is a lot like being the general ma...       no  "
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 3\n",
      "CORRECT PREDICT POS: 3\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_vader_scores(neg_a, 'neg'))\n",
    "df_p = pd.DataFrame(get_vader_scores(pos_a, 'pos'))\n",
    "\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 3: Cornell Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9695</td>\n",
       "      <td>bad . bad . \\nbad . \\nthat one word seems to p...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.1722</td>\n",
       "      <td>isn't it the ultimate sign of a movie's cinema...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9970</td>\n",
       "      <td>\" gordy \" is not a movie , it is a 90-minute-...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9861</td>\n",
       "      <td>disconnect the phone line . \\ndon't accept the...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.7445</td>\n",
       "      <td>when robert forster found himself famous again...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9828</td>\n",
       "      <td>synopsis : when a meteorite crashlands in the ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8979</td>\n",
       "      <td>it's now the anniversary of the slayings of ju...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9371</td>\n",
       "      <td>coinciding with the emerging popularity of mov...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9923</td>\n",
       "      <td>and now the high-flying hong kong style of fil...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9837</td>\n",
       "      <td>battlefield long , boring and just plain stupi...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  compound  \\\n",
       "0     neg        pos    0.9695   \n",
       "1     neg        pos    0.1722   \n",
       "2     neg        neg   -0.9970   \n",
       "3     neg        pos    0.9861   \n",
       "4     neg        pos    0.7445   \n",
       "..    ...        ...       ...   \n",
       "995   neg        pos    0.9828   \n",
       "996   neg        pos    0.8979   \n",
       "997   neg        neg   -0.9371   \n",
       "998   neg        neg   -0.9923   \n",
       "999   neg        neg   -0.9837   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    bad . bad . \\nbad . \\nthat one word seems to p...       no  \n",
       "1    isn't it the ultimate sign of a movie's cinema...       no  \n",
       "2     \" gordy \" is not a movie , it is a 90-minute-...      yes  \n",
       "3    disconnect the phone line . \\ndon't accept the...       no  \n",
       "4    when robert forster found himself famous again...       no  \n",
       "..                                                 ...      ...  \n",
       "995  synopsis : when a meteorite crashlands in the ...       no  \n",
       "996  it's now the anniversary of the slayings of ju...       no  \n",
       "997  coinciding with the emerging popularity of mov...      yes  \n",
       "998  and now the high-flying hong kong style of fil...      yes  \n",
       "999  battlefield long , boring and just plain stupi...      yes  \n",
       "\n",
       "[1000 rows x 5 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9985</td>\n",
       "      <td>assume nothing . \\nthe phrase is perhaps one o...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9853</td>\n",
       "      <td>plot : derek zoolander is a male model . \\nhe ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9998</td>\n",
       "      <td>i actually am a fan of the original 1961 or so...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9671</td>\n",
       "      <td>a movie that's been as highly built up as the ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9300</td>\n",
       "      <td>\" good will hunting \" is two movies in one : ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9913</td>\n",
       "      <td>one of the funniest carry on movies and the th...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9985</td>\n",
       "      <td>i remember making a pact , right after `patch ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9964</td>\n",
       "      <td>barely scrapping by playing at a nyc piano bar...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9975</td>\n",
       "      <td>if the current trends of hollywood filmmaking ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9914</td>\n",
       "      <td>capsule : the director of cure brings a weird ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  compound  \\\n",
       "0     pos        pos    0.9985   \n",
       "1     pos        pos    0.9853   \n",
       "2     pos        pos    0.9998   \n",
       "3     pos        pos    0.9671   \n",
       "4     pos        pos    0.9300   \n",
       "..    ...        ...       ...   \n",
       "995   pos        pos    0.9913   \n",
       "996   pos        pos    0.9985   \n",
       "997   pos        pos    0.9964   \n",
       "998   pos        pos    0.9975   \n",
       "999   pos        neg   -0.9914   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    assume nothing . \\nthe phrase is perhaps one o...      yes  \n",
       "1    plot : derek zoolander is a male model . \\nhe ...      yes  \n",
       "2    i actually am a fan of the original 1961 or so...      yes  \n",
       "3    a movie that's been as highly built up as the ...      yes  \n",
       "4     \" good will hunting \" is two movies in one : ...      yes  \n",
       "..                                                 ...      ...  \n",
       "995  one of the funniest carry on movies and the th...      yes  \n",
       "996  i remember making a pact , right after `patch ...      yes  \n",
       "997  barely scrapping by playing at a nyc piano bar...      yes  \n",
       "998  if the current trends of hollywood filmmaking ...      yes  \n",
       "999  capsule : the director of cure brings a weird ...       no  \n",
       "\n",
       "[1000 rows x 5 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 445\n",
      "CORRECT PREDICT POS: 828\n"
     ]
    }
   ],
   "source": [
    "df_n = pd.DataFrame(get_vader_scores(neg_cornell, 'neg'))\n",
    "df_p = pd.DataFrame(get_vader_scores(pos_cornell, 'pos'))\n",
    "\n",
    "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
    "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
    "\n",
    "display(df_n)\n",
    "display(df_p)\n",
    "\n",
    "print('CORRECT PREDICT NEG:',(df_n['accurate']=='yes').sum())\n",
    "print('CORRECT PREDICT POS:',(df_p['accurate']=='yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 4: Dirty Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9326</td>\n",
       "      <td>by starring in amy heckerlings  clueless  two ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.8326</td>\n",
       "      <td>i have little against remakes and updates of o...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9491</td>\n",
       "      <td>i cant recall a previous film experience where...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9854</td>\n",
       "      <td>the tagline for this film is :  some houses ar...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.8077</td>\n",
       "      <td>warner brothers ; rated pg-13 ( mild violence ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9927</td>\n",
       "      <td>`the bachelor is one of the best terrible movi...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9803</td>\n",
       "      <td>as a hot-shot defense attorney , kevin lomax (...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.3950</td>\n",
       "      <td>violence is bad . violence is ugly . violence ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9827</td>\n",
       "      <td>even though i have the utmost respect for rich...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.5308</td>\n",
       "      <td>an attempt at florida film noir , palmetto fai...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  compound  \\\n",
       "0     neg        neg   -0.9326   \n",
       "1     neg        pos    0.8326   \n",
       "2     neg        pos    0.9491   \n",
       "3     neg        pos    0.9854   \n",
       "4     neg        neg   -0.8077   \n",
       "..    ...        ...       ...   \n",
       "995   neg        pos    0.9927   \n",
       "996   neg        neg   -0.9803   \n",
       "997   neg        neg   -0.3950   \n",
       "998   neg        pos    0.9827   \n",
       "999   neg        neg   -0.5308   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    by starring in amy heckerlings  clueless  two ...      yes  \n",
       "1    i have little against remakes and updates of o...       no  \n",
       "2    i cant recall a previous film experience where...       no  \n",
       "3    the tagline for this film is :  some houses ar...       no  \n",
       "4    warner brothers ; rated pg-13 ( mild violence ...      yes  \n",
       "..                                                 ...      ...  \n",
       "995  `the bachelor is one of the best terrible movi...       no  \n",
       "996  as a hot-shot defense attorney , kevin lomax (...      yes  \n",
       "997  violence is bad . violence is ugly . violence ...      yes  \n",
       "998  even though i have the utmost respect for rich...       no  \n",
       "999  an attempt at florida film noir , palmetto fai...      yes  \n",
       "\n",
       "[1000 rows x 5 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>compound</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9888</td>\n",
       "      <td>for the first reel of girls town , you just ca...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9885</td>\n",
       "      <td>field of dreams almost defies description . al...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9806</td>\n",
       "      <td>meet joe black is your classic boy-meets-girl ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9614</td>\n",
       "      <td>an indian runner was more than a courier . he ...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9992</td>\n",
       "      <td>every once in a while , when an exceptional fa...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>995</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9920</td>\n",
       "      <td>the laserman : somehow the title of writer-dir...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>996</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9924</td>\n",
       "      <td>i know what you did last summer ,  the first...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>997</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9921</td>\n",
       "      <td>buffalo ? 66 is a very rarely known movie that...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>998</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.9574</td>\n",
       "      <td>time bandits , from director terry gilliam , i...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>999</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.9947</td>\n",
       "      <td>warren beattys  bulworth  is a caustic politic...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  compound  \\\n",
       "0     pos        neg   -0.9888   \n",
       "1     pos        pos    0.9885   \n",
       "2     pos        pos    0.9806   \n",
       "3     pos        neg   -0.9614   \n",
       "4     pos        pos    0.9992   \n",
       "..    ...        ...       ...   \n",
       "995   pos        pos    0.9920   \n",
       "996   pos        neg   -0.9924   \n",
       "997   pos        pos    0.9921   \n",
       "998   pos        pos    0.9574   \n",
       "999   pos        neg   -0.9947   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0    for the first reel of girls town , you just ca...       no  \n",
       "1    field of dreams almost defies description . al...      yes  \n",
       "2    meet joe black is your classic boy-meets-girl ...      yes  \n",
       "3    an indian runner was more than a courier . he ...       no  \n",
       "4    every once in a while , when an exceptional fa...      yes  \n",
       "..                                                 ...      ...  \n",
       "995  the laserman : somehow the title of writer-dir...      yes  \n",
       "996    i know what you did last summer ,  the first...       no  \n",
       "997  buffalo ? 66 is a very rarely known movie that...      yes  \n",
       "998  time bandits , from director terry gilliam , i...      yes  \n",
       "999  warren beattys  bulworth  is a caustic politic...       no  \n",
       "\n",
       "[1000 rows x 5 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 454\n",
      "CORRECT PREDICT POS: 824\n"
     ]
    }
   ],
   "source": [
    "# Score each half of the dirty corpus via get_vader_scores.\n",
    "df_n = pd.DataFrame(get_vader_scores(neg_dirty, 'neg'))\n",
    "df_p = pd.DataFrame(get_vader_scores(pos_dirty, 'pos'))\n",
    "\n",
    "# Mark a row 'yes' when its prediction equals its label, otherwise 'no'.\n",
    "for scored in (df_n, df_p):\n",
    "    scored['accurate'] = np.where(scored['label'].eq(scored['prediction']), 'yes', 'no')\n",
    "\n",
    "display(df_n, df_p)\n",
    "\n",
    "# Number of correctly classified documents per class.\n",
    "print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())\n",
    "print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 5: Joker Review Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.152083</td>\n",
       "      <td>1734</td>\n",
       "      <td>Missed Opportunity\\nI had been very excited t...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.001852</td>\n",
       "      <td>328</td>\n",
       "      <td>5/5 for Phoenix's acting..\\nI don't think the...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.200000</td>\n",
       "      <td>145</td>\n",
       "      <td>Everyone praised an overrated movie.\\nOverrat...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.038095</td>\n",
       "      <td>350</td>\n",
       "      <td>What idiotic FIlm\\nI can say that Phoenix is ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.126398</td>\n",
       "      <td>711</td>\n",
       "      <td>Terrible\\nThe only thing good about this movi...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>118</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.290909</td>\n",
       "      <td>432</td>\n",
       "      <td>Boring and disappointing 😣\\nGreat job acting ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>119</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.164710</td>\n",
       "      <td>853</td>\n",
       "      <td>A masterclass in acting nothing more\\nI don't...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>120</td>\n",
       "      <td>neg</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.126667</td>\n",
       "      <td>242</td>\n",
       "      <td>Not equal to the sum of its parts.\\nDespite a...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>121</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.187500</td>\n",
       "      <td>128</td>\n",
       "      <td>Not real Joker\\nThis movie is poorly done as ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>122</td>\n",
       "      <td>neg</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.057436</td>\n",
       "      <td>1212</td>\n",
       "      <td>HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>123 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     neg        pos   0.152083    1734   \n",
       "1     neg        neg  -0.001852     328   \n",
       "2     neg        pos   0.200000     145   \n",
       "3     neg        neg  -0.038095     350   \n",
       "4     neg        pos   0.126398     711   \n",
       "..    ...        ...        ...     ...   \n",
       "118   neg        neg  -0.290909     432   \n",
       "119   neg        pos   0.164710     853   \n",
       "120   neg        pos   0.126667     242   \n",
       "121   neg        neg  -0.187500     128   \n",
       "122   neg        neg  -0.057436    1212   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0     Missed Opportunity\\nI had been very excited t...       no  \n",
       "1     5/5 for Phoenix's acting..\\nI don't think the...      yes  \n",
       "2     Everyone praised an overrated movie.\\nOverrat...       no  \n",
       "3     What idiotic FIlm\\nI can say that Phoenix is ...      yes  \n",
       "4     Terrible\\nThe only thing good about this movi...       no  \n",
       "..                                                 ...      ...  \n",
       "118   Boring and disappointing 😣\\nGreat job acting ...      yes  \n",
       "119   A masterclass in acting nothing more\\nI don't...       no  \n",
       "120   Not equal to the sum of its parts.\\nDespite a...       no  \n",
       "121   Not real Joker\\nThis movie is poorly done as ...      yes  \n",
       "122   HAH HAAH HAAAH HAAAAH HAAAAAH HAAAAAAH HAAAAA...      yes  \n",
       "\n",
       "[123 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>prediction</th>\n",
       "      <th>sentiment</th>\n",
       "      <th>length</th>\n",
       "      <th>excerpt</th>\n",
       "      <th>accurate</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.107162</td>\n",
       "      <td>5554</td>\n",
       "      <td>funny like a clown\\nGreetings again from the ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.014881</td>\n",
       "      <td>473</td>\n",
       "      <td>Only certain people can relate\\nThis is a mov...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.008294</td>\n",
       "      <td>2509</td>\n",
       "      <td>\"That's Life.\"\\nIn an era of cinema so satura...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.036939</td>\n",
       "      <td>4022</td>\n",
       "      <td>Best DC movie since The Dark Knight Rises\\nDC...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>pos</td>\n",
       "      <td>neg</td>\n",
       "      <td>-0.017162</td>\n",
       "      <td>1430</td>\n",
       "      <td>unbelievable, unrelatable, a bit boring to be...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>118</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.065000</td>\n",
       "      <td>353</td>\n",
       "      <td>Nerve-wracking, but in very uncomfortable way...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>119</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.035557</td>\n",
       "      <td>3501</td>\n",
       "      <td>Solid film but there are glaring problems\\nOk...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>120</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.250203</td>\n",
       "      <td>510</td>\n",
       "      <td>Joker &gt; Endgame\\nNeed I say more? Everything ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>121</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.003030</td>\n",
       "      <td>424</td>\n",
       "      <td>Absolutely not a 10\\nStrong fanboy and hype r...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>122</td>\n",
       "      <td>pos</td>\n",
       "      <td>pos</td>\n",
       "      <td>0.117628</td>\n",
       "      <td>363</td>\n",
       "      <td>Overhyped, but it's alright\\nIt's a good film...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>123 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "    label prediction  sentiment  length  \\\n",
       "0     pos        pos   0.107162    5554   \n",
       "1     pos        pos   0.014881     473   \n",
       "2     pos        pos   0.008294    2509   \n",
       "3     pos        pos   0.036939    4022   \n",
       "4     pos        neg  -0.017162    1430   \n",
       "..    ...        ...        ...     ...   \n",
       "118   pos        pos   0.065000     353   \n",
       "119   pos        pos   0.035557    3501   \n",
       "120   pos        pos   0.250203     510   \n",
       "121   pos        pos   0.003030     424   \n",
       "122   pos        pos   0.117628     363   \n",
       "\n",
       "                                               excerpt accurate  \n",
       "0     funny like a clown\\nGreetings again from the ...      yes  \n",
       "1     Only certain people can relate\\nThis is a mov...      yes  \n",
       "2     \"That's Life.\"\\nIn an era of cinema so satura...      yes  \n",
       "3     Best DC movie since The Dark Knight Rises\\nDC...      yes  \n",
       "4     unbelievable, unrelatable, a bit boring to be...       no  \n",
       "..                                                 ...      ...  \n",
       "118   Nerve-wracking, but in very uncomfortable way...      yes  \n",
       "119   Solid film but there are glaring problems\\nOk...      yes  \n",
       "120   Joker > Endgame\\nNeed I say more? Everything ...      yes  \n",
       "121   Absolutely not a 10\\nStrong fanboy and hype r...      yes  \n",
       "122   Overhyped, but it's alright\\nIt's a good film...      yes  \n",
       "\n",
       "[123 rows x 6 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "CORRECT PREDICT NEG: 64\n",
      "CORRECT PREDICT POS: 114\n"
     ]
    }
   ],
   "source": [
    "# Score each half of the Joker review set via get_vader_scores.\n",
    "df_n = pd.DataFrame(get_vader_scores(neg_joker, 'neg'))\n",
    "df_p = pd.DataFrame(get_vader_scores(pos_joker, 'pos'))\n",
    "\n",
    "# Mark a row 'yes' when its prediction equals its label, otherwise 'no'.\n",
    "for scored in (df_n, df_p):\n",
    "    scored['accurate'] = np.where(scored['label'].eq(scored['prediction']), 'yes', 'no')\n",
    "\n",
    "display(df_n, df_p)\n",
    "\n",
    "# Number of correctly classified documents per class.\n",
    "print('CORRECT PREDICT NEG:', df_n['accurate'].eq('yes').sum())\n",
    "print('CORRECT PREDICT POS:', df_p['accurate'].eq('yes').sum())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# NLTK with NaiveBayes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.classify import NaiveBayesClassifier\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *\n",
    "\n",
    "def get_tokens(sentence):\n",
    "    \"\"\"Return the lowercased, purely alphabetic tokens of `sentence`.\n",
    "\n",
    "    The text is split with `word_tokenize`; any token for which\n",
    "    `str.isalpha()` is false (punctuation, digits, mixed tokens) is dropped.\n",
    "    \"\"\"\n",
    "    return [tok.lower() for tok in word_tokenize(sentence) if tok.isalpha()]\n",
    "\n",
    "def get_nltk_train_test(array, label, num_train):\n",
    "    \"\"\"Tokenize every text in `array`, tag it with `label`, and split it.\n",
    "\n",
    "    Returns a two-element list `[train_docs, test_docs]` of\n",
    "    `(tokens, label)` tuples: the first `num_train` documents are the\n",
    "    training part, the remaining ones are the test part.\n",
    "    \"\"\"\n",
    "    labeled = []\n",
    "    for sentence in array:\n",
    "        labeled.append((get_tokens(sentence), label))\n",
    "    head = labeled[:num_train]\n",
    "    tail = labeled[num_train:len(array)]\n",
    "    return [head, tail]\n",
    "\n",
    "\n",
    "def get_nltk_NB(NEG_DATA, POS_DATA, num_train):\n",
    "    \"\"\"Train and evaluate an NLTK Naive Bayes sentiment classifier.\n",
    "\n",
    "    The first `num_train` documents of each class are used for training and\n",
    "    the rest for evaluation. Each evaluation metric is printed as\n",
    "    `name: value`, sorted by metric name. Nothing is returned.\n",
    "    \"\"\"\n",
    "    neg_split = get_nltk_train_test(NEG_DATA, 'neg', num_train)\n",
    "    pos_split = get_nltk_train_test(POS_DATA, 'pos', num_train)\n",
    "    training_docs = neg_split[0] + pos_split[0]\n",
    "    testing_docs = neg_split[1] + pos_split[1]\n",
    "\n",
    "    analyzer = SentimentAnalyzer()\n",
    "    # The unigram vocabulary is built from negation-marked training docs.\n",
    "    # NOTE(review): features are then applied to the unmarked docs, so any\n",
    "    # negation-suffixed vocabulary entries presumably never match - confirm.\n",
    "    negated_docs = [mark_negation(doc) for doc in training_docs]\n",
    "    vocabulary = analyzer.unigram_word_feats(analyzer.all_words(negated_docs))\n",
    "    analyzer.add_feat_extractor(extract_unigram_feats, unigrams=vocabulary)\n",
    "    training_set = analyzer.apply_features(training_docs)\n",
    "    test_set = analyzer.apply_features(testing_docs)\n",
    "\n",
    "    analyzer.train(NaiveBayesClassifier.train, training_set)\n",
    "\n",
    "    metrics = analyzer.evaluate(test_set)\n",
    "    for metric_name in sorted(metrics):\n",
    "        print('{0}: {1}'.format(metric_name, metrics[metric_name]))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 1: Kendra's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 1.0\n",
      "F-measure [neg]: 1.0\n",
      "F-measure [pos]: 1.0\n",
      "Precision [neg]: 1.0\n",
      "Precision [pos]: 1.0\n",
      "Recall [neg]: 1.0\n",
      "Recall [pos]: 1.0\n"
     ]
    }
   ],
   "source": [
    "get_nltk_NB(neg_k, pos_k, 4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 2: Ami's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 0.5\n",
      "F-measure [neg]: 0.6666666666666666\n",
      "F-measure [pos]: None\n",
      "Precision [neg]: 0.5\n",
      "Precision [pos]: None\n",
      "Recall [neg]: 1.0\n",
      "Recall [pos]: 0.0\n"
     ]
    }
   ],
   "source": [
    "get_nltk_NB(neg_a, pos_a, 4)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 3: Cornell's Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 0.8125\n",
      "F-measure [neg]: 0.8259860788863109\n",
      "F-measure [pos]: 0.7967479674796748\n",
      "Precision [neg]: 0.7705627705627706\n",
      "Precision [pos]: 0.8698224852071006\n",
      "Recall [neg]: 0.89\n",
      "Recall [pos]: 0.735\n"
     ]
    }
   ],
   "source": [
    "get_nltk_NB(neg_cornell, pos_cornell, 800)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 4: Dirty Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 0.7775\n",
      "F-measure [neg]: 0.7944572748267898\n",
      "F-measure [pos]: 0.757493188010899\n",
      "Precision [neg]: 0.7381974248927039\n",
      "Precision [pos]: 0.8323353293413174\n",
      "Recall [neg]: 0.86\n",
      "Recall [pos]: 0.695\n"
     ]
    }
   ],
   "source": [
    "get_nltk_NB(neg_dirty, pos_dirty, 800)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CASE STUDY 5: Joker Review Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n",
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 0.581081081081081\n",
      "F-measure [neg]: 0.6593406593406593\n",
      "F-measure [pos]: 0.456140350877193\n",
      "Precision [neg]: 0.5555555555555556\n",
      "Precision [pos]: 0.65\n",
      "Recall [neg]: 0.8108108108108109\n",
      "Recall [pos]: 0.35135135135135137\n"
     ]
    }
   ],
   "source": [
    "get_nltk_NB(neg_joker, pos_joker, 86)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Naive Bayes Gaussian Style"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "def create_labeled_df(array, label):\n",
    "    \"\"\"Build a DataFrame from `array` with an added constant `label` column.\"\"\"\n",
    "    return pd.DataFrame(array).assign(label=label)\n",
    "\n",
    "def get_nb_gaus(neg, pos):\n",
    "    \"\"\"Stack the labeled negative and positive texts and print the result.\n",
    "\n",
    "    Each input is turned into a DataFrame with a `label` column ('neg' or\n",
    "    'pos'); the negative rows come first. No classifier is fit here yet -\n",
    "    the function only builds and prints the combined frame.\n",
    "    \"\"\"\n",
    "    neg_df = create_labeled_df(neg, 'neg')\n",
    "    pos_df = create_labeled_df(pos, 'pos')\n",
    "    # DataFrame.append was removed in pandas 2.0; pd.concat yields the same\n",
    "    # frame and keeps each input's original index, as append did.\n",
    "    all_df = pd.concat([neg_df, pos_df])\n",
    "    print(all_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                                                     0 label\n",
      "0     Missed Opportunity\\nI had been very excited t...   neg\n",
      "1     5/5 for Phoenix's acting..\\nI don't think the...   neg\n",
      "2     Everyone praised an overrated movie.\\nOverrat...   neg\n",
      "3     What idiotic FIlm\\nI can say that Phoenix is ...   neg\n",
      "4     Terrible\\nThe only thing good about this movi...   neg\n",
      "..                                                 ...   ...\n",
      "118   Nerve-wracking, but in very uncomfortable way...   pos\n",
      "119   Solid film but there are glaring problems\\nOk...   pos\n",
      "120   Joker > Endgame\\nNeed I say more? Everything ...   pos\n",
      "121   Absolutely not a 10\\nStrong fanboy and hype r...   pos\n",
      "122   Overhyped, but it's alright\\nIt's a good film...   pos\n",
      "\n",
      "[246 rows x 2 columns]\n"
     ]
    }
   ],
   "source": [
    "get_nb_gaus(neg_joker, pos_joker)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}