{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Sentiment Analysis\n",
"## TextBlob + Vader + NLTK + Naive Bayes\n",
"via [this tutorial](https://levelup.gitconnected.com/sentiment-analysis-using-machine-learning-python-9122e03f8f7b) |10-6-19"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"from textblob import TextBlob\n",
"from IPython.display import display, HTML\n",
"import os\n",
"import pandas as pd\n",
"import numpy as np\n",
"from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
"sid = SentimentIntensityAnalyzer()"
]
},
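{
"cell_type": "markdown",
"metadata": {},
"source": [
"VADER ships as an NLTK resource that is downloaded separately from the library code. A minimal setup cell (safe to re-run; the downloads are no-ops once the resources are present):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import nltk\n",
"# One-time downloads; quiet=True suppresses the progress output\n",
"nltk.download('vader_lexicon', quiet=True)  # lexicon behind SentimentIntensityAnalyzer\n",
"nltk.download('punkt', quiet=True)          # tokenizer models used later by word_tokenize"
]
},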
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def get_data_from_files(path):\n",
" directory = os.listdir(path)\n",
" results = []\n",
" for file in directory:\n",
" f=open(path+file)\n",
" results.append(f.read())\n",
" f.close()\n",
" return results\n",
"\n",
"# HW 1\n",
"neg_k = get_data_from_files('AI_NEG/')\n",
"pos_k = get_data_from_files('AI_POS/')\n",
"neg_a = get_data_from_files('NEG/')\n",
"pos_a = get_data_from_files('POS/')\n",
"\n",
"# HW2\n",
"neg_cornell = get_data_from_files('neg_cornell/')\n",
"pos_cornell = get_data_from_files('pos_cornell/')\n",
"\n",
"# HW3\n",
"neg_dirty = get_data_from_files('NEG_dirty/')\n",
"pos_dirty = get_data_from_files('POS_dirty/')\n",
"neg_joker = get_data_from_files('NEG_JK/')\n",
"pos_joker = get_data_from_files('POS_JK/')\n",
"\n",
"# HW4\n",
"neg_hw4 = get_data_from_files('neg_hw4/')\n",
"pos_hw4 = get_data_from_files('pos_hw4/')\n",
"\n",
"# HW4\n",
"false_lie_hw4 = get_data_from_files('hw4_lie_false/')\n",
"true_lie_hw4 = get_data_from_files('hw4_lie_true/')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# TEXT BLOB"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"def get_pn(num):\n",
" return 'neg' if num < 0 else 'pos'\n",
"\n",
"def get_sentiment(array, label):\n",
" blobs = [[TextBlob(text), text] for text in array]\n",
" return ([{'label': label,\n",
" 'prediction': get_pn(obj.sentiment.polarity),\n",
" 'sentiment': obj.sentiment.polarity,\n",
" 'length': len(text), \n",
" 'excerpt': text[:50]} for obj,text in blobs])"
]
},
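{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before scoring whole reviews, it helps to see what TextBlob returns for one sentence: `sentiment` is a namedtuple whose `polarity` falls in [-1, 1] and whose `subjectivity` falls in [0, 1]. Note that `get_pn` above maps a polarity of exactly 0 to 'pos'. A quick sanity check (the example sentence is ours):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"demo = TextBlob('I love this movie, but the ending was terrible.')\n",
"print(demo.sentiment)  # Sentiment(polarity=..., subjectivity=...)\n",
"print(get_pn(demo.sentiment.polarity))"
]
},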
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 1: Kendra's Data"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.157143 | \n",
" 76 | \n",
" WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" neg | \n",
" -0.750000 | \n",
" 96 | \n",
" How can we trust Artificial Intelligence to dr... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.775000 | \n",
" 31 | \n",
" I hate artificial intelligence! | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.750000 | \n",
" 47 | \n",
" My dog is terrified by artificial intelligence! | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" neg | \n",
" -0.750000 | \n",
" 68 | \n",
" Artificial intelligence is going to melt the b... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg neg -0.157143 76 \n",
"1 neg neg -0.750000 96 \n",
"2 neg neg -0.775000 31 \n",
"3 neg neg -0.750000 47 \n",
"4 neg neg -0.750000 68 \n",
"\n",
" excerpt accurate \n",
"0 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI... yes \n",
"1 How can we trust Artificial Intelligence to dr... yes \n",
"2 I hate artificial intelligence! yes \n",
"3 My dog is terrified by artificial intelligence! yes \n",
"4 Artificial intelligence is going to melt the b... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" neg | \n",
" -0.112500 | \n",
" 65 | \n",
" My dog is excited by the advancements in artif... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" neg | \n",
" -0.075000 | \n",
" 133 | \n",
" I'm excited for my child to grow up and have t... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" neg | \n",
" -0.125000 | \n",
" 31 | \n",
" I love artificial intelligence! | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" neg | \n",
" -0.300000 | \n",
" 121 | \n",
" Order my groceries, pay my taxes, take my kids... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" neg | \n",
" -0.133333 | \n",
" 116 | \n",
" I'm grateful every day that my child will like... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos neg -0.112500 65 \n",
"1 pos neg -0.075000 133 \n",
"2 pos neg -0.125000 31 \n",
"3 pos neg -0.300000 121 \n",
"4 pos neg -0.133333 116 \n",
"\n",
" excerpt accurate \n",
"0 My dog is excited by the advancements in artif... no \n",
"1 I'm excited for my child to grow up and have t... no \n",
"2 I love artificial intelligence! no \n",
"3 Order my groceries, pay my taxes, take my kids... no \n",
"4 I'm grateful every day that my child will like... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 5 out of 5 1.0\n",
"CORRECT PREDICT TRUE: 0 out of 5 0.0\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_k, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_k, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n)\n",
"display(df_p)\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
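{
"cell_type": "markdown",
"metadata": {},
"source": [
"The block above (build the two frames, flag accuracy, display, print hit rates) is repeated nearly verbatim for every case study in this notebook. A small helper could consolidate it; the sketch below is ours (`evaluate` is not part of the tutorial) and is shown once for reference rather than used in the cells that follow:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def evaluate(neg_texts, pos_texts, scorer, n_show=5):\n",
"    # scorer is any function with the signature of get_sentiment:\n",
"    # it takes (texts, label) and returns a list of row dicts\n",
"    df_n = pd.DataFrame(scorer(neg_texts, 'neg'))\n",
"    df_p = pd.DataFrame(scorer(pos_texts, 'pos'))\n",
"    for df in (df_n, df_p):\n",
"        df['accurate'] = np.where(df['label'] == df['prediction'], 'yes', 'no')\n",
"        display(df[:n_show])\n",
"        correct = (df['accurate'] == 'yes').sum()\n",
"        print(df['label'].iloc[0], 'accuracy:', correct, 'out of', len(df), correct / len(df))\n",
"    return df_n, df_p"
]
},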
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 2: Ami's Data"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.054577 | \n",
" 3554 | \n",
" that's exactly how long the movie felt to me .... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.025467 | \n",
" 2929 | \n",
" \" quest for camelot \" is warner bros . ' firs... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.003334 | \n",
" 3365 | \n",
" so ask yourself what \" 8mm \" ( \" eight millime... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.022925 | \n",
" 4418 | \n",
" synopsis : a mentally unstable man undergoing ... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.043234 | \n",
" 3911 | \n",
" capsule : in 2176 on the planet mars police ta... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg neg -0.054577 3554 \n",
"1 neg pos 0.025467 2929 \n",
"2 neg pos 0.003334 3365 \n",
"3 neg pos 0.022925 4418 \n",
"4 neg pos 0.043234 3911 \n",
"\n",
" excerpt accurate \n",
"0 that's exactly how long the movie felt to me .... yes \n",
"1 \" quest for camelot \" is warner bros . ' firs... no \n",
"2 so ask yourself what \" 8mm \" ( \" eight millime... no \n",
"3 synopsis : a mentally unstable man undergoing ... no \n",
"4 capsule : in 2176 on the planet mars police ta... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.023663 | \n",
" 4227 | \n",
" films adapted from comic books have had plenty... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.131092 | \n",
" 2421 | \n",
" you've got mail works alot better than it dese... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.110626 | \n",
" 6092 | \n",
" \" jaws \" is a rare film that grabs your atten... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.103847 | \n",
" 4096 | \n",
" every now and then a movie comes along from a ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" neg | \n",
" -0.070151 | \n",
" 3898 | \n",
" moviemaking is a lot like being the general ma... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.023663 4227 \n",
"1 pos pos 0.131092 2421 \n",
"2 pos pos 0.110626 6092 \n",
"3 pos pos 0.103847 4096 \n",
"4 pos neg -0.070151 3898 \n",
"\n",
" excerpt accurate \n",
"0 films adapted from comic books have had plenty... yes \n",
"1 you've got mail works alot better than it dese... yes \n",
"2 \" jaws \" is a rare film that grabs your atten... yes \n",
"3 every now and then a movie comes along from a ... yes \n",
"4 moviemaking is a lot like being the general ma... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 1 out of 5 0.2\n",
"CORRECT PREDICT TRUE: 4 out of 5 0.8\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_a, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_a, 'pos'))\n",
"\n",
"import numpy as np\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n)\n",
"display(df_p)\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 3: Cornell Data"
]
},
{
"cell_type": "code",
"execution_count": 44,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.026240 | \n",
" 5953 | \n",
" bad . bad . \\nbad . \\nthat one word seems to p... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.076040 | \n",
" 3396 | \n",
" isn't it the ultimate sign of a movie's cinema... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.128733 | \n",
" 2762 | \n",
" \" gordy \" is not a movie , it is a 90-minute-... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.000485 | \n",
" 3840 | \n",
" disconnect the phone line . \\ndon't accept the... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.122770 | \n",
" 2270 | \n",
" when robert forster found himself famous again... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg pos 0.026240 5953 \n",
"1 neg pos 0.076040 3396 \n",
"2 neg neg -0.128733 2762 \n",
"3 neg neg -0.000485 3840 \n",
"4 neg pos 0.122770 2270 \n",
"\n",
" excerpt accurate \n",
"0 bad . bad . \\nbad . \\nthat one word seems to p... no \n",
"1 isn't it the ultimate sign of a movie's cinema... no \n",
"2 \" gordy \" is not a movie , it is a 90-minute-... yes \n",
"3 disconnect the phone line . \\ndon't accept the... yes \n",
"4 when robert forster found himself famous again... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.221173 | \n",
" 4662 | \n",
" assume nothing . \\nthe phrase is perhaps one o... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.089736 | \n",
" 3839 | \n",
" plot : derek zoolander is a male model . \\nhe ... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.206743 | \n",
" 9380 | \n",
" i actually am a fan of the original 1961 or so... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.141905 | \n",
" 2407 | \n",
" a movie that's been as highly built up as the ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.176332 | \n",
" 1840 | \n",
" \" good will hunting \" is two movies in one : ... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.221173 4662 \n",
"1 pos pos 0.089736 3839 \n",
"2 pos pos 0.206743 9380 \n",
"3 pos pos 0.141905 2407 \n",
"4 pos pos 0.176332 1840 \n",
"\n",
" excerpt accurate \n",
"0 assume nothing . \\nthe phrase is perhaps one o... yes \n",
"1 plot : derek zoolander is a male model . \\nhe ... yes \n",
"2 i actually am a fan of the original 1961 or so... yes \n",
"3 a movie that's been as highly built up as the ... yes \n",
"4 \" good will hunting \" is two movies in one : ... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 229 out of 1000 0.229\n",
"CORRECT PREDICT TRUE: 971 out of 1000 0.971\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))\n",
"\n",
"import numpy as np\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 4: Dirty Data"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.004665 | \n",
" 3777 | \n",
" by starring in amy heckerlings clueless two ... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.119184 | \n",
" 3639 | \n",
" i have little against remakes and updates of o... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.100886 | \n",
" 4247 | \n",
" i cant recall a previous film experience where... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.097526 | \n",
" 4308 | \n",
" the tagline for this film is : some houses ar... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.048745 | \n",
" 5175 | \n",
" warner brothers ; rated pg-13 ( mild violence ... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg neg -0.004665 3777 \n",
"1 neg pos 0.119184 3639 \n",
"2 neg pos 0.100886 4247 \n",
"3 neg pos 0.097526 4308 \n",
"4 neg pos 0.048745 5175 \n",
"\n",
" excerpt accurate \n",
"0 by starring in amy heckerlings clueless two ... yes \n",
"1 i have little against remakes and updates of o... no \n",
"2 i cant recall a previous film experience where... no \n",
"3 the tagline for this film is : some houses ar... no \n",
"4 warner brothers ; rated pg-13 ( mild violence ... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.134641 | \n",
" 4584 | \n",
" for the first reel of girls town , you just ca... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.137134 | \n",
" 3102 | \n",
" field of dreams almost defies description . al... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.181355 | \n",
" 3521 | \n",
" meet joe black is your classic boy-meets-girl ... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.104101 | \n",
" 2192 | \n",
" an indian runner was more than a courier . he ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.204967 | \n",
" 4955 | \n",
" every once in a while , when an exceptional fa... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.134641 4584 \n",
"1 pos pos 0.137134 3102 \n",
"2 pos pos 0.181355 3521 \n",
"3 pos pos 0.104101 2192 \n",
"4 pos pos 0.204967 4955 \n",
"\n",
" excerpt accurate \n",
"0 for the first reel of girls town , you just ca... yes \n",
"1 field of dreams almost defies description . al... yes \n",
"2 meet joe black is your classic boy-meets-girl ... yes \n",
"3 an indian runner was more than a courier . he ... yes \n",
"4 every once in a while , when an exceptional fa... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 227 out of 1000 0.227\n",
"CORRECT PREDICT TRUE: 972 out of 1000 0.972\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_dirty, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_dirty, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 5: Joker Review Data"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.152083 | \n",
" 1734 | \n",
" Missed Opportunity\\nI had been very excited t... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" neg | \n",
" -0.001852 | \n",
" 328 | \n",
" 5/5 for Phoenix's acting..\\nI don't think the... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.200000 | \n",
" 145 | \n",
" Everyone praised an overrated movie.\\nOverrat... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.038095 | \n",
" 350 | \n",
" What idiotic FIlm\\nI can say that Phoenix is ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.126398 | \n",
" 711 | \n",
" Terrible\\nThe only thing good about this movi... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg pos 0.152083 1734 \n",
"1 neg neg -0.001852 328 \n",
"2 neg pos 0.200000 145 \n",
"3 neg neg -0.038095 350 \n",
"4 neg pos 0.126398 711 \n",
"\n",
" excerpt accurate \n",
"0 Missed Opportunity\\nI had been very excited t... no \n",
"1 5/5 for Phoenix's acting..\\nI don't think the... yes \n",
"2 Everyone praised an overrated movie.\\nOverrat... no \n",
"3 What idiotic FIlm\\nI can say that Phoenix is ... yes \n",
"4 Terrible\\nThe only thing good about this movi... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.107162 | \n",
" 5554 | \n",
" funny like a clown\\nGreetings again from the ... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.014881 | \n",
" 473 | \n",
" Only certain people can relate\\nThis is a mov... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.008294 | \n",
" 2509 | \n",
" \"That's Life.\"\\nIn an era of cinema so satura... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.036939 | \n",
" 4022 | \n",
" Best DC movie since The Dark Knight Rises\\nDC... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" neg | \n",
" -0.017162 | \n",
" 1430 | \n",
" unbelievable, unrelatable, a bit boring to be... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.107162 5554 \n",
"1 pos pos 0.014881 473 \n",
"2 pos pos 0.008294 2509 \n",
"3 pos pos 0.036939 4022 \n",
"4 pos neg -0.017162 1430 \n",
"\n",
" excerpt accurate \n",
"0 funny like a clown\\nGreetings again from the ... yes \n",
"1 Only certain people can relate\\nThis is a mov... yes \n",
"2 \"That's Life.\"\\nIn an era of cinema so satura... yes \n",
"3 Best DC movie since The Dark Knight Rises\\nDC... yes \n",
"4 unbelievable, unrelatable, a bit boring to be... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 64 out of 123 0.5203252032520326\n",
"CORRECT PREDICT TRUE: 114 out of 123 0.926829268292683\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_joker, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_joker, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 6: HW4 [Sentiment]"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.273958 | \n",
" 251 | \n",
" I went to XYZ restaurant last week and I was v... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.083333 | \n",
" 359 | \n",
" In each of the diner dish there are at least o... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.134722 | \n",
" 748 | \n",
" This is the last place you would want to dine ... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.166667 | \n",
" 378 | \n",
" I went to this restaurant where I had ordered ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.152455 | \n",
" 381 | \n",
" I went there with two friends at 6pm. Long que... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg neg -0.273958 251 \n",
"1 neg pos 0.083333 359 \n",
"2 neg neg -0.134722 748 \n",
"3 neg neg -0.166667 378 \n",
"4 neg pos 0.152455 381 \n",
"\n",
" excerpt accurate \n",
"0 I went to XYZ restaurant last week and I was v... yes \n",
"1 In each of the diner dish there are at least o... no \n",
"2 This is the last place you would want to dine ... yes \n",
"3 I went to this restaurant where I had ordered ... yes \n",
"4 I went there with two friends at 6pm. Long que... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.626786 | \n",
" 132 | \n",
" This restaurant ROCKS! I mean the food is grea... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.500000 | \n",
" 441 | \n",
" Stronghearts cafe is the BEST! The owners have... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.480208 | \n",
" 485 | \n",
" I went to cruise dinner in NYC with Spirit Cru... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.240278 | \n",
" 404 | \n",
" Halos is home. I have been here numerous times... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.552083 | \n",
" 324 | \n",
" The best restaurant I have ever been was a sma... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.626786 132 \n",
"1 pos pos 0.500000 441 \n",
"2 pos pos 0.480208 485 \n",
"3 pos pos 0.240278 404 \n",
"4 pos pos 0.552083 324 \n",
"\n",
" excerpt accurate \n",
"0 This restaurant ROCKS! I mean the food is grea... yes \n",
"1 Stronghearts cafe is the BEST! The owners have... yes \n",
"2 I went to cruise dinner in NYC with Spirit Cru... yes \n",
"3 Halos is home. I have been here numerous times... yes \n",
"4 The best restaurant I have ever been was a sma... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 26 out of 46 0.5652173913043478\n",
"CORRECT PREDICT TRUE: 46 out of 46 1.0\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(neg_hw4, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(pos_hw4, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 7: HW4 [Deception]"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.442752 | \n",
" 386 | \n",
" Gannon’s Isle Ice Cream served the best ice cr... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.197500 | \n",
" 340 | \n",
" Hibachi the grill is one of my favorite restau... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.353912 | \n",
" 790 | \n",
" RIM KAAP One of the best Thai restaurants in t... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.578788 | \n",
" 391 | \n",
" It is a France restaurant which has Michelin t... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.331373 | \n",
" 710 | \n",
" Its hard to pick a favorite dining experience ... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 neg pos 0.442752 386 \n",
"1 neg pos 0.197500 340 \n",
"2 neg pos 0.353912 790 \n",
"3 neg pos 0.578788 391 \n",
"4 neg pos 0.331373 710 \n",
"\n",
" excerpt accurate \n",
"0 Gannon’s Isle Ice Cream served the best ice cr... no \n",
"1 Hibachi the grill is one of my favorite restau... no \n",
"2 RIM KAAP One of the best Thai restaurants in t... no \n",
"3 It is a France restaurant which has Michelin t... no \n",
"4 Its hard to pick a favorite dining experience ... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" sentiment | \n",
" length | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.000000 | \n",
" 1 | \n",
" ? | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.236833 | \n",
" 289 | \n",
" Twin Trees Cicero NY HUGE salad bar and high q... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" neg | \n",
" -0.249762 | \n",
" 519 | \n",
" The worst restaurant that I have ever eaten in... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.000000 | \n",
" 1 | \n",
" ? | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.019481 | \n",
" 234 | \n",
" I have been to a Asian restaurant in New York ... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction sentiment length \\\n",
"0 pos pos 0.000000 1 \n",
"1 pos pos 0.236833 289 \n",
"2 pos neg -0.249762 519 \n",
"3 pos pos 0.000000 1 \n",
"4 pos pos 0.019481 234 \n",
"\n",
" excerpt accurate \n",
"0 ? yes \n",
"1 Twin Trees Cicero NY HUGE salad bar and high q... yes \n",
"2 The worst restaurant that I have ever eaten in... no \n",
"3 ? yes \n",
"4 I have been to a Asian restaurant in New York ... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 14 out of 46 0.30434782608695654\n",
"CORRECT PREDICT TRUE: 34 out of 46 0.7391304347826086\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_sentiment(false_lie_hw4, 'neg'))\n",
"df_p = pd.DataFrame(get_sentiment(true_lie_hw4, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# VADER"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"def get_pn(num):\n",
" return 'neg' if num < 0 else 'pos'\n",
"\n",
"def get_vader_scores(array, label):\n",
" vader_array = []\n",
" for sentence in array:\n",
" ss = sid.polarity_scores(sentence)\n",
" vader_array.append({'label': label,\n",
" 'prediction': get_pn(ss['compound']),\n",
" 'compound': ss['compound'], \n",
" 'excerpt': sentence[:50]})\n",
" return vader_array"
]
},
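{
"cell_type": "markdown",
"metadata": {},
"source": [
"`polarity_scores` returns a dict with `neg`, `neu`, `pos`, and a normalized `compound` score in [-1, 1]. As with TextBlob, `get_pn` treats a compound of exactly 0 as 'pos'; a common stricter convention (not used here) reserves a neutral band such as |compound| < 0.05. A quick check on a made-up sentence:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"ss = sid.polarity_scores('VADER handles CAPS, punctuation!!! and emoticons :) surprisingly well.')\n",
"print(ss)  # {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}\n",
"print(get_pn(ss['compound']))"
]
},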
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[\"WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICIAL INTELLIGENCE TOOK OUR JOBS.\",\n",
" \"How can we trust Artificial Intelligence to drive our cars when they can't even hack a captcha?!\",\n",
" 'I hate artificial intelligence!',\n",
" 'My dog is terrified by artificial intelligence!',\n",
" 'Artificial intelligence is going to melt the brains of our children!']"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"neg_k"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 1: Kendra's Data"
]
},
{
"cell_type": "code",
"execution_count": 38,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.5255 | \n",
" WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.7712 | \n",
" How can we trust Artificial Intelligence to dr... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.2244 | \n",
" I hate artificial intelligence! | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.2942 | \n",
" My dog is terrified by artificial intelligence! | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.5255 | \n",
" Artificial intelligence is going to melt the b... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg pos 0.5255 \n",
"1 neg pos 0.7712 \n",
"2 neg neg -0.2244 \n",
"3 neg neg -0.2942 \n",
"4 neg pos 0.5255 \n",
"\n",
" excerpt accurate \n",
"0 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI... no \n",
"1 How can we trust Artificial Intelligence to dr... no \n",
"2 I hate artificial intelligence! yes \n",
"3 My dog is terrified by artificial intelligence! yes \n",
"4 Artificial intelligence is going to melt the b... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.6705 | \n",
" My dog is excited by the advancements in artif... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.8271 | \n",
" I'm excited for my child to grow up and have t... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.8221 | \n",
" I love artificial intelligence! | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.8213 | \n",
" Order my groceries, pay my taxes, take my kids... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.8402 | \n",
" I'm grateful every day that my child will like... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos pos 0.6705 \n",
"1 pos pos 0.8271 \n",
"2 pos pos 0.8221 \n",
"3 pos pos 0.8213 \n",
"4 pos pos 0.8402 \n",
"\n",
" excerpt accurate \n",
"0 My dog is excited by the advancements in artif... yes \n",
"1 I'm excited for my child to grow up and have t... yes \n",
"2 I love artificial intelligence! yes \n",
"3 Order my groceries, pay my taxes, take my kids... yes \n",
"4 I'm grateful every day that my child will like... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 2 out of 5 0.4\n",
"CORRECT PREDICT TRUE: 5 out of 5 1.0\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_k, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_k, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n)\n",
"display(df_p)\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 2: Ami's Data"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.7836 | \n",
" that's exactly how long the movie felt to me .... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" neg | \n",
" -0.8481 | \n",
" \" quest for camelot \" is warner bros . ' firs... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.9753 | \n",
" so ask yourself what \" 8mm \" ( \" eight millime... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.6824 | \n",
" synopsis : a mentally unstable man undergoing ... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" neg | \n",
" -0.9879 | \n",
" capsule : in 2176 on the planet mars police ta... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg pos 0.7836 \n",
"1 neg neg -0.8481 \n",
"2 neg neg -0.9753 \n",
"3 neg pos 0.6824 \n",
"4 neg neg -0.9879 \n",
"\n",
" excerpt accurate \n",
"0 that's exactly how long the movie felt to me .... no \n",
"1 \" quest for camelot \" is warner bros . ' firs... yes \n",
"2 so ask yourself what \" 8mm \" ( \" eight millime... yes \n",
"3 synopsis : a mentally unstable man undergoing ... no \n",
"4 capsule : in 2176 on the planet mars police ta... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" neg | \n",
" -0.5887 | \n",
" films adapted from comic books have had plenty... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.9964 | \n",
" you've got mail works alot better than it dese... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.9868 | \n",
" \" jaws \" is a rare film that grabs your atten... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.8825 | \n",
" every now and then a movie comes along from a ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" neg | \n",
" -0.3525 | \n",
" moviemaking is a lot like being the general ma... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos neg -0.5887 \n",
"1 pos pos 0.9964 \n",
"2 pos pos 0.9868 \n",
"3 pos pos 0.8825 \n",
"4 pos neg -0.3525 \n",
"\n",
" excerpt accurate \n",
"0 films adapted from comic books have had plenty... no \n",
"1 you've got mail works alot better than it dese... yes \n",
"2 \" jaws \" is a rare film that grabs your atten... yes \n",
"3 every now and then a movie comes along from a ... yes \n",
"4 moviemaking is a lot like being the general ma... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 3 out of 5 0.6\n",
"CORRECT PREDICT TRUE: 3 out of 5 0.6\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_a, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_a, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n)\n",
"display(df_p)\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 3: Cornell Data"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.9695 | \n",
" bad . bad . \\nbad . \\nthat one word seems to p... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.1722 | \n",
" isn't it the ultimate sign of a movie's cinema... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" neg | \n",
" -0.9970 | \n",
" \" gordy \" is not a movie , it is a 90-minute-... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.9861 | \n",
" disconnect the phone line . \\ndon't accept the... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.7445 | \n",
" when robert forster found himself famous again... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg pos 0.9695 \n",
"1 neg pos 0.1722 \n",
"2 neg neg -0.9970 \n",
"3 neg pos 0.9861 \n",
"4 neg pos 0.7445 \n",
"\n",
" excerpt accurate \n",
"0 bad . bad . \\nbad . \\nthat one word seems to p... no \n",
"1 isn't it the ultimate sign of a movie's cinema... no \n",
"2 \" gordy \" is not a movie , it is a 90-minute-... yes \n",
"3 disconnect the phone line . \\ndon't accept the... no \n",
"4 when robert forster found himself famous again... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.9985 | \n",
" assume nothing . \\nthe phrase is perhaps one o... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.9853 | \n",
" plot : derek zoolander is a male model . \\nhe ... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.9998 | \n",
" i actually am a fan of the original 1961 or so... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.9671 | \n",
" a movie that's been as highly built up as the ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.9300 | \n",
" \" good will hunting \" is two movies in one : ... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos pos 0.9985 \n",
"1 pos pos 0.9853 \n",
"2 pos pos 0.9998 \n",
"3 pos pos 0.9671 \n",
"4 pos pos 0.9300 \n",
"\n",
" excerpt accurate \n",
"0 assume nothing . \\nthe phrase is perhaps one o... yes \n",
"1 plot : derek zoolander is a male model . \\nhe ... yes \n",
"2 i actually am a fan of the original 1961 or so... yes \n",
"3 a movie that's been as highly built up as the ... yes \n",
"4 \" good will hunting \" is two movies in one : ... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 445 out of 1000 0.445\n",
"CORRECT PREDICT TRUE: 828 out of 1000 0.828\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_cornell, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_cornell, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 4: Dirty Data"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.9326 | \n",
" by starring in amy heckerlings clueless two ... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.8326 | \n",
" i have little against remakes and updates of o... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.9491 | \n",
" i cant recall a previous film experience where... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.9854 | \n",
" the tagline for this film is : some houses ar... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" neg | \n",
" -0.8077 | \n",
" warner brothers ; rated pg-13 ( mild violence ... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg neg -0.9326 \n",
"1 neg pos 0.8326 \n",
"2 neg pos 0.9491 \n",
"3 neg pos 0.9854 \n",
"4 neg neg -0.8077 \n",
"\n",
" excerpt accurate \n",
"0 by starring in amy heckerlings clueless two ... yes \n",
"1 i have little against remakes and updates of o... no \n",
"2 i cant recall a previous film experience where... no \n",
"3 the tagline for this film is : some houses ar... no \n",
"4 warner brothers ; rated pg-13 ( mild violence ... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" neg | \n",
" -0.9888 | \n",
" for the first reel of girls town , you just ca... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.9885 | \n",
" field of dreams almost defies description . al... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.9806 | \n",
" meet joe black is your classic boy-meets-girl ... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" neg | \n",
" -0.9614 | \n",
" an indian runner was more than a courier . he ... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.9992 | \n",
" every once in a while , when an exceptional fa... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos neg -0.9888 \n",
"1 pos pos 0.9885 \n",
"2 pos pos 0.9806 \n",
"3 pos neg -0.9614 \n",
"4 pos pos 0.9992 \n",
"\n",
" excerpt accurate \n",
"0 for the first reel of girls town , you just ca... no \n",
"1 field of dreams almost defies description . al... yes \n",
"2 meet joe black is your classic boy-meets-girl ... yes \n",
"3 an indian runner was more than a courier . he ... no \n",
"4 every once in a while , when an exceptional fa... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 454 out of 1000 0.454\n",
"CORRECT PREDICT TRUE: 824 out of 1000 0.824\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_dirty, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_dirty, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 5: Joker Review Data"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.7501 | \n",
" Missed Opportunity\\nI had been very excited t... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.7184 | \n",
" 5/5 for Phoenix's acting..\\nI don't think the... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.7269 | \n",
" Everyone praised an overrated movie.\\nOverrat... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.6698 | \n",
" What idiotic FIlm\\nI can say that Phoenix is ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.7184 | \n",
" Terrible\\nThe only thing good about this movi... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg pos 0.7501 \n",
"1 neg pos 0.7184 \n",
"2 neg pos 0.7269 \n",
"3 neg neg -0.6698 \n",
"4 neg pos 0.7184 \n",
"\n",
" excerpt accurate \n",
"0 Missed Opportunity\\nI had been very excited t... no \n",
"1 5/5 for Phoenix's acting..\\nI don't think the... no \n",
"2 Everyone praised an overrated movie.\\nOverrat... no \n",
"3 What idiotic FIlm\\nI can say that Phoenix is ... yes \n",
"4 Terrible\\nThe only thing good about this movi... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.9976 | \n",
" funny like a clown\\nGreetings again from the ... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.9231 | \n",
" Only certain people can relate\\nThis is a mov... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.9796 | \n",
" \"That's Life.\"\\nIn an era of cinema so satura... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" neg | \n",
" -0.9586 | \n",
" Best DC movie since The Dark Knight Rises\\nDC... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" neg | \n",
" -0.8813 | \n",
" unbelievable, unrelatable, a bit boring to be... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos pos 0.9976 \n",
"1 pos pos 0.9231 \n",
"2 pos pos 0.9796 \n",
"3 pos neg -0.9586 \n",
"4 pos neg -0.8813 \n",
"\n",
" excerpt accurate \n",
"0 funny like a clown\\nGreetings again from the ... yes \n",
"1 Only certain people can relate\\nThis is a mov... yes \n",
"2 \"That's Life.\"\\nIn an era of cinema so satura... yes \n",
"3 Best DC movie since The Dark Knight Rises\\nDC... no \n",
"4 unbelievable, unrelatable, a bit boring to be... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 68 out of 123 0.5528455284552846\n",
"CORRECT PREDICT TRUE: 94 out of 123 0.7642276422764228\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_joker, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_joker, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 6: HW4 [Sentiment]"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" neg | \n",
" -0.6807 | \n",
" I went to XYZ restaurant last week and I was v... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" neg | \n",
" -0.6329 | \n",
" In each of the diner dish there are at least o... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.5161 | \n",
" This is the last place you would want to dine ... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" neg | \n",
" -0.5423 | \n",
" I went to this restaurant where I had ordered ... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.8842 | \n",
" I went there with two friends at 6pm. Long que... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg neg -0.6807 \n",
"1 neg neg -0.6329 \n",
"2 neg pos 0.5161 \n",
"3 neg neg -0.5423 \n",
"4 neg pos 0.8842 \n",
"\n",
" excerpt accurate \n",
"0 I went to XYZ restaurant last week and I was v... yes \n",
"1 In each of the diner dish there are at least o... yes \n",
"2 This is the last place you would want to dine ... no \n",
"3 I went to this restaurant where I had ordered ... yes \n",
"4 I went there with two friends at 6pm. Long que... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.9840 | \n",
" This restaurant ROCKS! I mean the food is grea... | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.9702 | \n",
" Stronghearts cafe is the BEST! The owners have... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" pos | \n",
" 0.9106 | \n",
" I went to cruise dinner in NYC with Spirit Cru... | \n",
" yes | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.9349 | \n",
" Halos is home. I have been here numerous times... | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.9686 | \n",
" The best restaurant I have ever been was a sma... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos pos 0.9840 \n",
"1 pos pos 0.9702 \n",
"2 pos pos 0.9106 \n",
"3 pos pos 0.9349 \n",
"4 pos pos 0.9686 \n",
"\n",
" excerpt accurate \n",
"0 This restaurant ROCKS! I mean the food is grea... yes \n",
"1 Stronghearts cafe is the BEST! The owners have... yes \n",
"2 I went to cruise dinner in NYC with Spirit Cru... yes \n",
"3 Halos is home. I have been here numerous times... yes \n",
"4 The best restaurant I have ever been was a sma... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 26 out of 46 0.5652173913043478\n",
"CORRECT PREDICT TRUE: 45 out of 46 0.9782608695652174\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(neg_hw4, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(pos_hw4, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 7: HW4 [Deception]"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" neg | \n",
" pos | \n",
" 0.9328 | \n",
" Gannon’s Isle Ice Cream served the best ice cr... | \n",
" no | \n",
"
\n",
" \n",
" 1 | \n",
" neg | \n",
" pos | \n",
" 0.8885 | \n",
" Hibachi the grill is one of my favorite restau... | \n",
" no | \n",
"
\n",
" \n",
" 2 | \n",
" neg | \n",
" pos | \n",
" 0.9915 | \n",
" RIM KAAP One of the best Thai restaurants in t... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" neg | \n",
" pos | \n",
" 0.8625 | \n",
" It is a France restaurant which has Michelin t... | \n",
" no | \n",
"
\n",
" \n",
" 4 | \n",
" neg | \n",
" pos | \n",
" 0.9360 | \n",
" Its hard to pick a favorite dining experience ... | \n",
" no | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 neg pos 0.9328 \n",
"1 neg pos 0.8885 \n",
"2 neg pos 0.9915 \n",
"3 neg pos 0.8625 \n",
"4 neg pos 0.9360 \n",
"\n",
" excerpt accurate \n",
"0 Gannon’s Isle Ice Cream served the best ice cr... no \n",
"1 Hibachi the grill is one of my favorite restau... no \n",
"2 RIM KAAP One of the best Thai restaurants in t... no \n",
"3 It is a France restaurant which has Michelin t... no \n",
"4 Its hard to pick a favorite dining experience ... no "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" label | \n",
" prediction | \n",
" compound | \n",
" excerpt | \n",
" accurate | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" pos | \n",
" pos | \n",
" 0.0000 | \n",
" ? | \n",
" yes | \n",
"
\n",
" \n",
" 1 | \n",
" pos | \n",
" pos | \n",
" 0.8321 | \n",
" Twin Trees Cicero NY HUGE salad bar and high q... | \n",
" yes | \n",
"
\n",
" \n",
" 2 | \n",
" pos | \n",
" neg | \n",
" -0.8641 | \n",
" The worst restaurant that I have ever eaten in... | \n",
" no | \n",
"
\n",
" \n",
" 3 | \n",
" pos | \n",
" pos | \n",
" 0.0000 | \n",
" ? | \n",
" yes | \n",
"
\n",
" \n",
" 4 | \n",
" pos | \n",
" pos | \n",
" 0.5267 | \n",
" I have been to a Asian restaurant in New York ... | \n",
" yes | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" label prediction compound \\\n",
"0 pos pos 0.0000 \n",
"1 pos pos 0.8321 \n",
"2 pos neg -0.8641 \n",
"3 pos pos 0.0000 \n",
"4 pos pos 0.5267 \n",
"\n",
" excerpt accurate \n",
"0 ? yes \n",
"1 Twin Trees Cicero NY HUGE salad bar and high q... yes \n",
"2 The worst restaurant that I have ever eaten in... no \n",
"3 ? yes \n",
"4 I have been to a Asian restaurant in New York ... yes "
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CORRECT PREDICT FALSE: 13 out of 46 0.2826086956521739\n",
"CORRECT PREDICT TRUE: 32 out of 46 0.6956521739130435\n"
]
}
],
"source": [
"df_n = pd.DataFrame(get_vader_scores(false_lie_hw4, 'neg'))\n",
"df_p = pd.DataFrame(get_vader_scores(true_lie_hw4, 'pos'))\n",
"\n",
"df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n",
"df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n",
"\n",
"display(df_n[:5])\n",
"display(df_p[:5])\n",
"\n",
"sum_correct_n = (df_n['accurate']=='yes').sum()\n",
"sum_correct_p = (df_p['accurate']=='yes').sum()\n",
"\n",
"print('CORRECT PREDICT FALSE:', sum_correct_n, 'out of', len(df_n), sum_correct_n/len(df_n))\n",
"print('CORRECT PREDICT TRUE:', sum_correct_p, 'out of', len(df_p), sum_correct_p/len(df_p))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NLTK with NaiveBayes"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from nltk.classify import NaiveBayesClassifier\n",
"from nltk.tokenize import word_tokenize\n",
"from nltk.sentiment import SentimentAnalyzer\n",
"from nltk.sentiment.util import *\n",
"\n",
"def get_tokens(sentence):\n",
" tokens = word_tokenize(sentence)\n",
" clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
" return clean_tokens\n",
"\n",
"def get_nltk_train_test(array, label, num_train):\n",
" tokens = [get_tokens(sentence) for sentence in array]\n",
" docs = [(sent, label) for sent in tokens]\n",
" train_docs = docs[:num_train]\n",
" test_docs = docs[num_train:len(array)]\n",
" return [train_docs, test_docs]\n",
"\n",
"\n",
"def get_nltk_NB(NEG_DATA, POS_DATA, num_train):\n",
" train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)\n",
" train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)\n",
"\n",
" training_docs = train_neg + train_pos\n",
" testing_docs = test_neg + test_pos\n",
"\n",
" sentim_analyzer = SentimentAnalyzer()\n",
" all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n",
" unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n",
" sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n",
" training_set = sentim_analyzer.apply_features(training_docs)\n",
" test_set = sentim_analyzer.apply_features(testing_docs)\n",
"\n",
" trainer = NaiveBayesClassifier.train\n",
" classifier = sentim_analyzer.train(trainer, training_set)\n",
" \n",
" results = []\n",
" for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):\n",
" print('{0}: {1}'.format(key,value))"
]
},
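  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The function above only prints its evaluation. If it is tweaked to end with `return sentim_analyzer` (an assumption, not the code as written), the trained model can score unseen text via `SentimentAnalyzer.classify`, which applies the stored unigram feature extractor before calling the Naive Bayes classifier. A minimal sketch:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch only: assumes get_nltk_NB is modified to end with `return sentim_analyzer`\n",
    "analyzer = get_nltk_NB(neg_cornell, pos_cornell, 800)\n",
    "new_review = 'The acting was wonderful but the plot made no sense'\n",
    "print(analyzer.classify(get_tokens(new_review)))  # -> 'neg' or 'pos'"
   ]
  },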
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 1: Kendra's Data"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 1.0\n",
"F-measure [neg]: 1.0\n",
"F-measure [pos]: 1.0\n",
"Precision [neg]: 1.0\n",
"Precision [pos]: 1.0\n",
"Recall [neg]: 1.0\n",
"Recall [pos]: 1.0\n"
]
}
],
"source": [
"get_nltk_NB(neg_k, pos_k, 4)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 2: Ami's Data"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.5\n",
"F-measure [neg]: 0.6666666666666666\n",
"F-measure [pos]: None\n",
"Precision [neg]: 0.5\n",
"Precision [pos]: None\n",
"Recall [neg]: 1.0\n",
"Recall [pos]: 0.0\n"
]
}
],
"source": [
"get_nltk_NB(neg_a, pos_a, 4)"
]
},
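  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Why the `None` scores: every held-out document is predicted `neg` (Recall [pos] is 0.0), so there are no `pos` predictions and the `pos` precision and F-measure are undefined; NLTK reports them as `None` rather than dividing by zero. With only 4 training documents per class, a degenerate split like this is expected."
   ]
  },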
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 3: Cornell's Data"
]
},
{
"cell_type": "code",
"execution_count": 52,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.8125\n",
"F-measure [neg]: 0.8259860788863109\n",
"F-measure [pos]: 0.7967479674796748\n",
"Precision [neg]: 0.7705627705627706\n",
"Precision [pos]: 0.8698224852071006\n",
"Recall [neg]: 0.89\n",
"Recall [pos]: 0.735\n"
]
}
],
"source": [
"get_nltk_NB(neg_cornell, pos_cornell, 800)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 4: Dirty Data"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.7775\n",
"F-measure [neg]: 0.7944572748267898\n",
"F-measure [pos]: 0.757493188010899\n",
"Precision [neg]: 0.7381974248927039\n",
"Precision [pos]: 0.8323353293413174\n",
"Recall [neg]: 0.86\n",
"Recall [pos]: 0.695\n"
]
}
],
"source": [
"get_nltk_NB(neg_dirty, pos_dirty, 800)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 5: Joker Review Data"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.581081081081081\n",
"F-measure [neg]: 0.6593406593406593\n",
"F-measure [pos]: 0.456140350877193\n",
"Precision [neg]: 0.5555555555555556\n",
"Precision [pos]: 0.65\n",
"Recall [neg]: 0.8108108108108109\n",
"Recall [pos]: 0.35135135135135137\n"
]
}
],
"source": [
"get_nltk_NB(neg_joker, pos_joker, 86)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 6: HW4 [Sentiment]"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.75\n",
"F-measure [neg]: 0.6956521739130435\n",
"F-measure [pos]: 0.787878787878788\n",
"Precision [neg]: 0.8888888888888888\n",
"Precision [pos]: 0.6842105263157895\n",
"Recall [neg]: 0.5714285714285714\n",
"Recall [pos]: 0.9285714285714286\n"
]
}
],
"source": [
"get_nltk_NB(neg_hw4, pos_hw4, 32)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## CASE STUDY 7: HW4 [Deception]"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Training classifier\n",
"Evaluating NaiveBayesClassifier results...\n",
"Accuracy: 0.5714285714285714\n",
"F-measure [neg]: 0.5714285714285714\n",
"F-measure [pos]: 0.5714285714285714\n",
"Precision [neg]: 0.5714285714285714\n",
"Precision [pos]: 0.5714285714285714\n",
"Recall [neg]: 0.5714285714285714\n",
"Recall [pos]: 0.5714285714285714\n"
]
}
],
"source": [
"get_nltk_NB(false_lie_hw4, true_lie_hw4, 32)"
]
},
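  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "At barely-above-chance accuracy (0.57), it is worth asking which words the model leans on. NLTK's `NaiveBayesClassifier` exposes `show_most_informative_features`; the sketch below assumes, as noted earlier, that `get_nltk_NB` is modified to return its `sentim_analyzer` (the trained classifier is stored on its `.classifier` attribute)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sketch: inspect the strongest unigram cues for the deception split\n",
    "# (assumes get_nltk_NB returns sentim_analyzer -- see the note after its definition)\n",
    "analyzer = get_nltk_NB(false_lie_hw4, true_lie_hw4, 32)\n",
    "analyzer.classifier.show_most_informative_features(10)"
   ]
  },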
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"list"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"type(false_lie_hw4)"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Gannon’s Isle Ice Cream served the best ice cream and you better believe it! The place is ideally situated and it is easy to get too. The ice cream is delicious the best I had. There were so many varieties that I had trouble choosing it. I had the chocolate and raspberry. A weird combination but the smooth sweet chocolate combined with the sharp taste of raspberry was devine! Try it!'"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"false_lie_hw4[0]"
]
  }
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}