{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [],
   "source": [
    "# import nltk\n",
    "# from nltk.tokenize import word_tokenize\n",
    "# from nltk.probability import FreqDist\n",
    "# file = open('WK2/moviereview_arff.arff')\n",
    "# tokens = []\n",
    "# for line in file:\n",
    "# #     print(type(line))\n",
    "# #     tokens.append(word_tokenize(line))\n",
    "# len(tokens)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "import nltk\n",
    "from nltk.tokenize import word_tokenize\n",
    "from nltk.probability import FreqDist\n",
    "import pandas as pd\n",
    "file = open('WK2/moviereview.csv')\n",
    "all_df = pd.DataFrame(file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# freq_dist_sent = []\n",
    "# for sent in tokenized_sentences[1:2]:\n",
    "#     fdist = FreqDist(sent)\n",
    "#     print(len(sent))\n",
    "#     print(fdist['bad'])\n",
    "# #     print(fdist.items())\n",
    "# #     print(sent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "all_df['tokenized'] = all_df.apply(lambda x: get_tokens(x[0]),axis=1)\n",
    "all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>tokenized</th>\n",
       "      <th>tokenized_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>text,reviewclass\\n</td>\n",
       "      <td>[text, reviewclass]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>'plot : two teen couples go to a church party ...</td>\n",
       "      <td>[two, teen, couples, go, to, a, church, party,...</td>\n",
       "      <td>638</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>'the happy bastard\\'s quick movie review \\ndam...</td>\n",
       "      <td>[happy, quick, movie, review, that, bug, got, ...</td>\n",
       "      <td>215</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>'it is movies like these that make a jaded mov...</td>\n",
       "      <td>[is, movies, like, these, that, make, a, jaded...</td>\n",
       "      <td>444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>' \\\" quest for camelot \\\" is warner bros . \\' ...</td>\n",
       "      <td>[quest, for, camelot, is, warner, bros, first,...</td>\n",
       "      <td>410</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1996</td>\n",
       "      <td>'wow ! what a movie . \\nit\\'s everything a mov...</td>\n",
       "      <td>[what, a, movie, everything, a, movie, can, be...</td>\n",
       "      <td>702</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1997</td>\n",
       "      <td>'richard gere can be a commanding actor , but ...</td>\n",
       "      <td>[gere, can, be, a, commanding, actor, but, not...</td>\n",
       "      <td>286</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1998</td>\n",
       "      <td>'glory--starring matthew broderick , denzel wa...</td>\n",
       "      <td>[starring, matthew, broderick, denzel, washing...</td>\n",
       "      <td>990</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1999</td>\n",
       "      <td>'steven spielberg\\'s second epic film on world...</td>\n",
       "      <td>[second, epic, film, on, world, war, ii, is, a...</td>\n",
       "      <td>538</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2000</td>\n",
       "      <td>'truman ( \\\" true-man \\\" ) burbank is the perf...</td>\n",
       "      <td>[burbank, is, the, perfect, name, for, jim, ch...</td>\n",
       "      <td>901</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2001 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      0  \\\n",
       "0                                    text,reviewclass\\n   \n",
       "1     'plot : two teen couples go to a church party ...   \n",
       "2     'the happy bastard\\'s quick movie review \\ndam...   \n",
       "3     'it is movies like these that make a jaded mov...   \n",
       "4     ' \\\" quest for camelot \\\" is warner bros . \\' ...   \n",
       "...                                                 ...   \n",
       "1996  'wow ! what a movie . \\nit\\'s everything a mov...   \n",
       "1997  'richard gere can be a commanding actor , but ...   \n",
       "1998  'glory--starring matthew broderick , denzel wa...   \n",
       "1999  'steven spielberg\\'s second epic film on world...   \n",
       "2000  'truman ( \\\" true-man \\\" ) burbank is the perf...   \n",
       "\n",
       "                                              tokenized  tokenized_count  \n",
       "0                                   [text, reviewclass]                2  \n",
       "1     [two, teen, couples, go, to, a, church, party,...              638  \n",
       "2     [happy, quick, movie, review, that, bug, got, ...              215  \n",
       "3     [is, movies, like, these, that, make, a, jaded...              444  \n",
       "4     [quest, for, camelot, is, warner, bros, first,...              410  \n",
       "...                                                 ...              ...  \n",
       "1996  [what, a, movie, everything, a, movie, can, be...              702  \n",
       "1997  [gere, can, be, a, commanding, actor, but, not...              286  \n",
       "1998  [starring, matthew, broderick, denzel, washing...              990  \n",
       "1999  [second, epic, film, on, world, war, ii, is, a...              538  \n",
       "2000  [burbank, is, the, perfect, name, for, jim, ch...              901  \n",
       "\n",
       "[2001 rows x 3 columns]"
      ]
     },
     "execution_count": 48,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "all_df['no_stopwords'] = all_df.apply(lambda x: remove_stopwords(x['tokenized']),axis=1)\n",
    "all_df['no_stopwords_count'] = all_df.apply(lambda x: len(x['no_stopwords']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>tokenized</th>\n",
       "      <th>tokenized_count</th>\n",
       "      <th>no_stopwords</th>\n",
       "      <th>no_stopwords_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>text,reviewclass\\n</td>\n",
       "      <td>[text, reviewclass]</td>\n",
       "      <td>2</td>\n",
       "      <td>[text, reviewclass]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>'plot : two teen couples go to a church party ...</td>\n",
       "      <td>[two, teen, couples, go, to, a, church, party,...</td>\n",
       "      <td>638</td>\n",
       "      <td>[two, teen, couples, go, church, party, drink,...</td>\n",
       "      <td>306</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>'the happy bastard\\'s quick movie review \\ndam...</td>\n",
       "      <td>[happy, quick, movie, review, that, bug, got, ...</td>\n",
       "      <td>215</td>\n",
       "      <td>[happy, quick, movie, review, bug, got, head, ...</td>\n",
       "      <td>119</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>'it is movies like these that make a jaded mov...</td>\n",
       "      <td>[is, movies, like, these, that, make, a, jaded...</td>\n",
       "      <td>444</td>\n",
       "      <td>[movies, like, make, jaded, movie, viewer, tha...</td>\n",
       "      <td>246</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>' \\\" quest for camelot \\\" is warner bros . \\' ...</td>\n",
       "      <td>[quest, for, camelot, is, warner, bros, first,...</td>\n",
       "      <td>410</td>\n",
       "      <td>[quest, camelot, warner, bros, first, attempt,...</td>\n",
       "      <td>234</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1996</td>\n",
       "      <td>'wow ! what a movie . \\nit\\'s everything a mov...</td>\n",
       "      <td>[what, a, movie, everything, a, movie, can, be...</td>\n",
       "      <td>702</td>\n",
       "      <td>[movie, everything, movie, funny, dramatic, in...</td>\n",
       "      <td>355</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1997</td>\n",
       "      <td>'richard gere can be a commanding actor , but ...</td>\n",
       "      <td>[gere, can, be, a, commanding, actor, but, not...</td>\n",
       "      <td>286</td>\n",
       "      <td>[gere, commanding, actor, always, great, films...</td>\n",
       "      <td>148</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1998</td>\n",
       "      <td>'glory--starring matthew broderick , denzel wa...</td>\n",
       "      <td>[starring, matthew, broderick, denzel, washing...</td>\n",
       "      <td>990</td>\n",
       "      <td>[starring, matthew, broderick, denzel, washing...</td>\n",
       "      <td>561</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1999</td>\n",
       "      <td>'steven spielberg\\'s second epic film on world...</td>\n",
       "      <td>[second, epic, film, on, world, war, ii, is, a...</td>\n",
       "      <td>538</td>\n",
       "      <td>[second, epic, film, world, war, ii, unquestio...</td>\n",
       "      <td>287</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2000</td>\n",
       "      <td>'truman ( \\\" true-man \\\" ) burbank is the perf...</td>\n",
       "      <td>[burbank, is, the, perfect, name, for, jim, ch...</td>\n",
       "      <td>901</td>\n",
       "      <td>[burbank, perfect, name, jim, character, film,...</td>\n",
       "      <td>483</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2001 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      0  \\\n",
       "0                                    text,reviewclass\\n   \n",
       "1     'plot : two teen couples go to a church party ...   \n",
       "2     'the happy bastard\\'s quick movie review \\ndam...   \n",
       "3     'it is movies like these that make a jaded mov...   \n",
       "4     ' \\\" quest for camelot \\\" is warner bros . \\' ...   \n",
       "...                                                 ...   \n",
       "1996  'wow ! what a movie . \\nit\\'s everything a mov...   \n",
       "1997  'richard gere can be a commanding actor , but ...   \n",
       "1998  'glory--starring matthew broderick , denzel wa...   \n",
       "1999  'steven spielberg\\'s second epic film on world...   \n",
       "2000  'truman ( \\\" true-man \\\" ) burbank is the perf...   \n",
       "\n",
       "                                              tokenized  tokenized_count  \\\n",
       "0                                   [text, reviewclass]                2   \n",
       "1     [two, teen, couples, go, to, a, church, party,...              638   \n",
       "2     [happy, quick, movie, review, that, bug, got, ...              215   \n",
       "3     [is, movies, like, these, that, make, a, jaded...              444   \n",
       "4     [quest, for, camelot, is, warner, bros, first,...              410   \n",
       "...                                                 ...              ...   \n",
       "1996  [what, a, movie, everything, a, movie, can, be...              702   \n",
       "1997  [gere, can, be, a, commanding, actor, but, not...              286   \n",
       "1998  [starring, matthew, broderick, denzel, washing...              990   \n",
       "1999  [second, epic, film, on, world, war, ii, is, a...              538   \n",
       "2000  [burbank, is, the, perfect, name, for, jim, ch...              901   \n",
       "\n",
       "                                           no_stopwords  no_stopwords_count  \n",
       "0                                   [text, reviewclass]                   2  \n",
       "1     [two, teen, couples, go, church, party, drink,...                 306  \n",
       "2     [happy, quick, movie, review, bug, got, head, ...                 119  \n",
       "3     [movies, like, make, jaded, movie, viewer, tha...                 246  \n",
       "4     [quest, camelot, warner, bros, first, attempt,...                 234  \n",
       "...                                                 ...                 ...  \n",
       "1996  [movie, everything, movie, funny, dramatic, in...                 355  \n",
       "1997  [gere, commanding, actor, always, great, films...                 148  \n",
       "1998  [starring, matthew, broderick, denzel, washing...                 561  \n",
       "1999  [second, epic, film, world, war, ii, unquestio...                 287  \n",
       "2000  [burbank, perfect, name, jim, character, film,...                 483  \n",
       "\n",
       "[2001 rows x 5 columns]"
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.probability import FreqDist\n",
    "def get_most_common(tokens):\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist\n",
    "all_df['fdist'] = all_df.apply(lambda x: get_most_common(x['no_stopwords']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df = all_df[1:]\n",
    "# all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n",
      "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  from ipykernel import kernelapp as app\n"
     ]
    }
   ],
   "source": [
    "# In the 2000 docs, how many times was \"bad\" used\n",
    "# inverse of the normalized value\n",
    "def get_bad(fdist): \n",
    "#     fdist['bad']\n",
    "    return fdist['bad']\n",
    "\n",
    "# import math\n",
    "# (math.log10(2000/760))\n",
    "# print((all_df['bad']!=0).sum())\n",
    "\n",
    "def get_tfidf(fdist): \n",
    "    return fdist['bad']*(math.log10(2000/760))\n",
    "\n",
    "all_df['bad'] = all_df.apply(lambda x: get_bad(x['fdist']),axis=1)\n",
    "all_df['tfidf_bad'] = all_df.apply(lambda x: get_tfidf(x['fdist']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>tokenized</th>\n",
       "      <th>tokenized_count</th>\n",
       "      <th>no_stopwords</th>\n",
       "      <th>no_stopwords_count</th>\n",
       "      <th>fdist</th>\n",
       "      <th>bad</th>\n",
       "      <th>tfidf_bad</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>'plot : two teen couples go to a church party ...</td>\n",
       "      <td>[two, teen, couples, go, to, a, church, party,...</td>\n",
       "      <td>638</td>\n",
       "      <td>[two, teen, couples, go, church, party, drink,...</td>\n",
       "      <td>306</td>\n",
       "      <td>{'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '...</td>\n",
       "      <td>2</td>\n",
       "      <td>0.840433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>'the happy bastard\\'s quick movie review \\ndam...</td>\n",
       "      <td>[happy, quick, movie, review, that, bug, got, ...</td>\n",
       "      <td>215</td>\n",
       "      <td>[happy, quick, movie, review, bug, got, head, ...</td>\n",
       "      <td>119</td>\n",
       "      <td>{'happy': 1, 'quick': 1, 'movie': 5, 'review':...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>'it is movies like these that make a jaded mov...</td>\n",
       "      <td>[is, movies, like, these, that, make, a, jaded...</td>\n",
       "      <td>444</td>\n",
       "      <td>[movies, like, make, jaded, movie, viewer, tha...</td>\n",
       "      <td>246</td>\n",
       "      <td>{'movies': 1, 'like': 4, 'make': 2, 'jaded': 1...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>' \\\" quest for camelot \\\" is warner bros . \\' ...</td>\n",
       "      <td>[quest, for, camelot, is, warner, bros, first,...</td>\n",
       "      <td>410</td>\n",
       "      <td>[quest, camelot, warner, bros, first, attempt,...</td>\n",
       "      <td>234</td>\n",
       "      <td>{'quest': 5, 'camelot': 4, 'warner': 1, 'bros'...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>'synopsis : a mentally unstable man undergoing...</td>\n",
       "      <td>[a, mentally, unstable, man, undergoing, psych...</td>\n",
       "      <td>658</td>\n",
       "      <td>[mentally, unstable, man, undergoing, psychoth...</td>\n",
       "      <td>346</td>\n",
       "      <td>{'mentally': 1, 'unstable': 1, 'man': 2, 'unde...</td>\n",
       "      <td>2</td>\n",
       "      <td>0.840433</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1996</td>\n",
       "      <td>'wow ! what a movie . \\nit\\'s everything a mov...</td>\n",
       "      <td>[what, a, movie, everything, a, movie, can, be...</td>\n",
       "      <td>702</td>\n",
       "      <td>[movie, everything, movie, funny, dramatic, in...</td>\n",
       "      <td>355</td>\n",
       "      <td>{'movie': 14, 'everything': 2, 'funny': 5, 'dr...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1997</td>\n",
       "      <td>'richard gere can be a commanding actor , but ...</td>\n",
       "      <td>[gere, can, be, a, commanding, actor, but, not...</td>\n",
       "      <td>286</td>\n",
       "      <td>[gere, commanding, actor, always, great, films...</td>\n",
       "      <td>148</td>\n",
       "      <td>{'gere': 1, 'commanding': 1, 'actor': 1, 'alwa...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1998</td>\n",
       "      <td>'glory--starring matthew broderick , denzel wa...</td>\n",
       "      <td>[starring, matthew, broderick, denzel, washing...</td>\n",
       "      <td>990</td>\n",
       "      <td>[starring, matthew, broderick, denzel, washing...</td>\n",
       "      <td>561</td>\n",
       "      <td>{'starring': 1, 'matthew': 1, 'broderick': 2, ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1999</td>\n",
       "      <td>'steven spielberg\\'s second epic film on world...</td>\n",
       "      <td>[second, epic, film, on, world, war, ii, is, a...</td>\n",
       "      <td>538</td>\n",
       "      <td>[second, epic, film, world, war, ii, unquestio...</td>\n",
       "      <td>287</td>\n",
       "      <td>{'second': 1, 'epic': 2, 'film': 14, 'world': ...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2000</td>\n",
       "      <td>'truman ( \\\" true-man \\\" ) burbank is the perf...</td>\n",
       "      <td>[burbank, is, the, perfect, name, for, jim, ch...</td>\n",
       "      <td>901</td>\n",
       "      <td>[burbank, perfect, name, jim, character, film,...</td>\n",
       "      <td>483</td>\n",
       "      <td>{'burbank': 4, 'perfect': 4, 'name': 1, 'jim':...</td>\n",
       "      <td>0</td>\n",
       "      <td>0.000000</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>2000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                      0  \\\n",
       "1     'plot : two teen couples go to a church party ...   \n",
       "2     'the happy bastard\\'s quick movie review \\ndam...   \n",
       "3     'it is movies like these that make a jaded mov...   \n",
       "4     ' \\\" quest for camelot \\\" is warner bros . \\' ...   \n",
       "5     'synopsis : a mentally unstable man undergoing...   \n",
       "...                                                 ...   \n",
       "1996  'wow ! what a movie . \\nit\\'s everything a mov...   \n",
       "1997  'richard gere can be a commanding actor , but ...   \n",
       "1998  'glory--starring matthew broderick , denzel wa...   \n",
       "1999  'steven spielberg\\'s second epic film on world...   \n",
       "2000  'truman ( \\\" true-man \\\" ) burbank is the perf...   \n",
       "\n",
       "                                              tokenized  tokenized_count  \\\n",
       "1     [two, teen, couples, go, to, a, church, party,...              638   \n",
       "2     [happy, quick, movie, review, that, bug, got, ...              215   \n",
       "3     [is, movies, like, these, that, make, a, jaded...              444   \n",
       "4     [quest, for, camelot, is, warner, bros, first,...              410   \n",
       "5     [a, mentally, unstable, man, undergoing, psych...              658   \n",
       "...                                                 ...              ...   \n",
       "1996  [what, a, movie, everything, a, movie, can, be...              702   \n",
       "1997  [gere, can, be, a, commanding, actor, but, not...              286   \n",
       "1998  [starring, matthew, broderick, denzel, washing...              990   \n",
       "1999  [second, epic, film, on, world, war, ii, is, a...              538   \n",
       "2000  [burbank, is, the, perfect, name, for, jim, ch...              901   \n",
       "\n",
       "                                           no_stopwords  no_stopwords_count  \\\n",
       "1     [two, teen, couples, go, church, party, drink,...                 306   \n",
       "2     [happy, quick, movie, review, bug, got, head, ...                 119   \n",
       "3     [movies, like, make, jaded, movie, viewer, tha...                 246   \n",
       "4     [quest, camelot, warner, bros, first, attempt,...                 234   \n",
       "5     [mentally, unstable, man, undergoing, psychoth...                 346   \n",
       "...                                                 ...                 ...   \n",
       "1996  [movie, everything, movie, funny, dramatic, in...                 355   \n",
       "1997  [gere, commanding, actor, always, great, films...                 148   \n",
       "1998  [starring, matthew, broderick, denzel, washing...                 561   \n",
       "1999  [second, epic, film, world, war, ii, unquestio...                 287   \n",
       "2000  [burbank, perfect, name, jim, character, film,...                 483   \n",
       "\n",
       "                                                  fdist  bad  tfidf_bad  \n",
       "1     {'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '...    2   0.840433  \n",
       "2     {'happy': 1, 'quick': 1, 'movie': 5, 'review':...    0   0.000000  \n",
       "3     {'movies': 1, 'like': 4, 'make': 2, 'jaded': 1...    0   0.000000  \n",
       "4     {'quest': 5, 'camelot': 4, 'warner': 1, 'bros'...    0   0.000000  \n",
       "5     {'mentally': 1, 'unstable': 1, 'man': 2, 'unde...    2   0.840433  \n",
       "...                                                 ...  ...        ...  \n",
       "1996  {'movie': 14, 'everything': 2, 'funny': 5, 'dr...    0   0.000000  \n",
       "1997  {'gere': 1, 'commanding': 1, 'actor': 1, 'alwa...    0   0.000000  \n",
       "1998  {'starring': 1, 'matthew': 1, 'broderick': 2, ...    0   0.000000  \n",
       "1999  {'second': 1, 'epic': 2, 'film': 14, 'world': ...    0   0.000000  \n",
       "2000  {'burbank': 4, 'perfect': 4, 'name': 1, 'jim':...    0   0.000000  \n",
       "\n",
       "[2000 rows x 8 columns]"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "760\n"
     ]
    }
   ],
   "source": [
    "print((all_df['bad']!=0).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4202164033831899"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import math\n",
    "(math.log10(2000/760))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n"
     ]
    }
   ],
   "source": [
    "all_df['removed'] = all_df['tokenized_count'] - all_df['no_stopwords_count']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "543035"
      ]
     },
     "execution_count": 61,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df['removed'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1189601"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df['tokenized_count'].sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.4564849895048844"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df['removed'].sum()/all_df['tokenized_count'].sum()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
