{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW3 JOKER EXTREMES"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 1: Import Data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "neg = get_data_from_files('../NEG_JK_E/')\n",
    "pos = get_data_from_files('../POS_JK_E/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Neg Reviews: 48\n",
      "Pos Reviews: 50\n"
     ]
    }
   ],
   "source": [
    "print('Neg Reviews:', len(neg))\n",
    "print('Pos Reviews:', len(pos))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 2: Turn into DF & Label it"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "neg_df = pd.DataFrame(neg)\n",
    "pos_df = pd.DataFrame(pos)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Add labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "pos_df['PoN'] = 'P'\n",
    "neg_df['PoN'] = 'N'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df = neg_df.append(pos_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>Everyone praised an overrated movie.\\nOverrat...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>What idiotic FIlm\\nI can say that Phoenix is ...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>Terrible\\nThe only thing good about this movi...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>Watch Taxi Driver instead\\nThis is a poor att...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I learned one thing.\\nIt borrows a lot of ele...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN\n",
       "0   Everyone praised an overrated movie.\\nOverrat...   N\n",
       "1   What idiotic FIlm\\nI can say that Phoenix is ...   N\n",
       "2   Terrible\\nThe only thing good about this movi...   N\n",
       "3   Watch Taxi Driver instead\\nThis is a poor att...   N\n",
       "4   I learned one thing.\\nIt borrows a lot of ele...   N"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 3: Tokenize and Clean!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
    "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>Everyone praised an overrated movie.\\nOverrat...</td>\n",
       "      <td>N</td>\n",
       "      <td>[everyone, praised, an, overrated, movie, over...</td>\n",
       "      <td>26</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>What idiotic FIlm\\nI can say that Phoenix is ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[what, idiotic, film, i, can, say, that, phoen...</td>\n",
       "      <td>66</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>Terrible\\nThe only thing good about this movi...</td>\n",
       "      <td>N</td>\n",
       "      <td>[terrible, the, only, thing, good, about, this...</td>\n",
       "      <td>124</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>Watch Taxi Driver instead\\nThis is a poor att...</td>\n",
       "      <td>N</td>\n",
       "      <td>[watch, taxi, driver, instead, this, is, a, po...</td>\n",
       "      <td>123</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I learned one thing.\\nIt borrows a lot of ele...</td>\n",
       "      <td>N</td>\n",
       "      <td>[i, learned, one, thing, it, borrows, a, lot, ...</td>\n",
       "      <td>70</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0   Everyone praised an overrated movie.\\nOverrat...   N   \n",
       "1   What idiotic FIlm\\nI can say that Phoenix is ...   N   \n",
       "2   Terrible\\nThe only thing good about this movi...   N   \n",
       "3   Watch Taxi Driver instead\\nThis is a poor att...   N   \n",
       "4   I learned one thing.\\nIt borrows a lot of ele...   N   \n",
       "\n",
       "                                              tokens  num_tokens  \n",
       "0  [everyone, praised, an, overrated, movie, over...          26  \n",
       "1  [what, idiotic, film, i, can, say, that, phoen...          66  \n",
       "2  [terrible, the, only, thing, good, about, this...         124  \n",
       "3  [watch, taxi, driver, instead, this, is, a, po...         123  \n",
       "4  [i, learned, one, thing, it, borrows, a, lot, ...          70  "
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 4: Create Bag of Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import casual_tokenize\n",
    "from collections import Counter\n",
    "# all_df['bow'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)\n",
    "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>bow</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>Everyone praised an overrated movie.\\nOverrat...</td>\n",
       "      <td>N</td>\n",
       "      <td>[everyone, praised, an, overrated, movie, over...</td>\n",
       "      <td>26</td>\n",
       "      <td>{'everyone': 1, 'praised': 1, 'an': 1, 'overra...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>What idiotic FIlm\\nI can say that Phoenix is ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[what, idiotic, film, i, can, say, that, phoen...</td>\n",
       "      <td>66</td>\n",
       "      <td>{'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>Terrible\\nThe only thing good about this movi...</td>\n",
       "      <td>N</td>\n",
       "      <td>[terrible, the, only, thing, good, about, this...</td>\n",
       "      <td>124</td>\n",
       "      <td>{'terrible': 3, 'the': 5, 'only': 1, 'thing': ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>Watch Taxi Driver instead\\nThis is a poor att...</td>\n",
       "      <td>N</td>\n",
       "      <td>[watch, taxi, driver, instead, this, is, a, po...</td>\n",
       "      <td>123</td>\n",
       "      <td>{'watch': 1, 'taxi': 2, 'driver': 2, 'instead'...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>I learned one thing.\\nIt borrows a lot of ele...</td>\n",
       "      <td>N</td>\n",
       "      <td>[i, learned, one, thing, it, borrows, a, lot, ...</td>\n",
       "      <td>70</td>\n",
       "      <td>{'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0   Everyone praised an overrated movie.\\nOverrat...   N   \n",
       "1   What idiotic FIlm\\nI can say that Phoenix is ...   N   \n",
       "2   Terrible\\nThe only thing good about this movi...   N   \n",
       "3   Watch Taxi Driver instead\\nThis is a poor att...   N   \n",
       "4   I learned one thing.\\nIt borrows a lot of ele...   N   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0  [everyone, praised, an, overrated, movie, over...          26   \n",
       "1  [what, idiotic, film, i, can, say, that, phoen...          66   \n",
       "2  [terrible, the, only, thing, good, about, this...         124   \n",
       "3  [watch, taxi, driver, instead, this, is, a, po...         123   \n",
       "4  [i, learned, one, thing, it, borrows, a, lot, ...          70   \n",
       "\n",
       "                                                 bow  \n",
       "0  {'everyone': 1, 'praised': 1, 'an': 1, 'overra...  \n",
       "1  {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...  \n",
       "2  {'terrible': 3, 'the': 5, 'only': 1, 'thing': ...  \n",
       "3  {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'...  \n",
       "4  {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '...  "
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 5: Vectorize -- Create a Frequency Distribution Matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>everyone</th>\n",
       "      <th>praised</th>\n",
       "      <th>an</th>\n",
       "      <th>overrated</th>\n",
       "      <th>movie</th>\n",
       "      <th>of</th>\n",
       "      <th>all</th>\n",
       "      <th>time</th>\n",
       "      <th>the</th>\n",
       "      <th>reviews</th>\n",
       "      <th>...</th>\n",
       "      <th>easy</th>\n",
       "      <th>answers</th>\n",
       "      <th>questions</th>\n",
       "      <th>raises</th>\n",
       "      <th>albeit</th>\n",
       "      <th>reinvention</th>\n",
       "      <th>source</th>\n",
       "      <th>material</th>\n",
       "      <th>alike</th>\n",
       "      <th>disturbed</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>4</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>3</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>9</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2648 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   everyone  praised  an  overrated  movie  of  all  time  the  reviews  ...  \\\n",
       "0         1        1   1          2      2   1    1     1    1        1  ...   \n",
       "1         0        0   0          0      2   0    0     0    2        0  ...   \n",
       "2         0        0   0          1      4   2    0     0    5        0  ...   \n",
       "3         0        0   0          0      3   5    0     0    9        0  ...   \n",
       "4         1        0   1          0      1   2    0     0    1        0  ...   \n",
       "\n",
       "   easy  answers  questions  raises  albeit  reinvention  source  material  \\\n",
       "0     0        0          0       0       0            0       0         0   \n",
       "1     0        0          0       0       0            0       0         0   \n",
       "2     0        0          0       0       0            0       0         0   \n",
       "3     0        0          0       0       0            0       0         0   \n",
       "4     0        0          0       0       0            0       0         0   \n",
       "\n",
       "   alike  disturbed  \n",
       "0      0          0  \n",
       "1      0          0  \n",
       "2      0          0  \n",
       "3      0          0  \n",
       "4      0          0  \n",
       "\n",
       "[5 rows x 2648 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "freq_df = pd.DataFrame(all_df['bow'].tolist())\n",
    "freq_df = freq_df.fillna(0).astype(int)\n",
    "freq_df[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 6: Normalize"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### With simple weights"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### With TFIDF"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfTransformer\n",
    "tfidf =TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)\n",
    "data =tfidf.fit_transform(freq_df.values)\n",
    "tfidf_reduced = pd.DataFrame(data.todense())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>1</th>\n",
       "      <th>2</th>\n",
       "      <th>3</th>\n",
       "      <th>4</th>\n",
       "      <th>5</th>\n",
       "      <th>6</th>\n",
       "      <th>7</th>\n",
       "      <th>8</th>\n",
       "      <th>9</th>\n",
       "      <th>...</th>\n",
       "      <th>2638</th>\n",
       "      <th>2639</th>\n",
       "      <th>2640</th>\n",
       "      <th>2641</th>\n",
       "      <th>2642</th>\n",
       "      <th>2643</th>\n",
       "      <th>2644</th>\n",
       "      <th>2645</th>\n",
       "      <th>2646</th>\n",
       "      <th>2647</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>0.200322</td>\n",
       "      <td>0.340128</td>\n",
       "      <td>0.127248</td>\n",
       "      <td>0.553100</td>\n",
       "      <td>0.190896</td>\n",
       "      <td>0.083310</td>\n",
       "      <td>0.130553</td>\n",
       "      <td>0.162156</td>\n",
       "      <td>0.073724</td>\n",
       "      <td>0.221842</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.112320</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.086756</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.106806</td>\n",
       "      <td>0.147451</td>\n",
       "      <td>0.064349</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.142363</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.109289</td>\n",
       "      <td>0.158984</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.253245</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>0.106888</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.067897</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.050929</td>\n",
       "      <td>0.088905</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.039338</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>...</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 2648 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "       0         1         2         3         4         5         6     \\\n",
       "0  0.200322  0.340128  0.127248  0.553100  0.190896  0.083310  0.130553   \n",
       "1  0.000000  0.000000  0.000000  0.000000  0.112320  0.000000  0.000000   \n",
       "2  0.000000  0.000000  0.000000  0.106806  0.147451  0.064349  0.000000   \n",
       "3  0.000000  0.000000  0.000000  0.000000  0.109289  0.158984  0.000000   \n",
       "4  0.106888  0.000000  0.067897  0.000000  0.050929  0.088905  0.000000   \n",
       "\n",
       "       7         8         9     ...  2638  2639  2640  2641  2642  2643  \\\n",
       "0  0.162156  0.073724  0.221842  ...   0.0   0.0   0.0   0.0   0.0   0.0   \n",
       "1  0.000000  0.086756  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   \n",
       "2  0.000000  0.142363  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   \n",
       "3  0.000000  0.253245  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   \n",
       "4  0.000000  0.039338  0.000000  ...   0.0   0.0   0.0   0.0   0.0   0.0   \n",
       "\n",
       "   2644  2645  2646  2647  \n",
       "0   0.0   0.0   0.0   0.0  \n",
       "1   0.0   0.0   0.0   0.0  \n",
       "2   0.0   0.0   0.0   0.0  \n",
       "3   0.0   0.0   0.0   0.0  \n",
       "4   0.0   0.0   0.0   0.0  \n",
       "\n",
       "[5 rows x 2648 columns]"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tfidf_reduced[:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## STEP 7: Test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB\n",
    "\n",
    "def get_NB(small_df, labels):\n",
    "    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "    gnb = GaussianNB()\n",
    "    gnb.fit(x_train, y_train)\n",
    "    y_pred = gnb.predict(x_test)\n",
    "    from sklearn import metrics\n",
    "    print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.7666666666666667\n"
     ]
    }
   ],
   "source": [
    "get_NB(tfidf_reduced, all_df['PoN'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
