{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "negative = os.listdir('NEG/')\n",
    "positive = os.listdir('POS/')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "positive_alltext = []\n",
    "for file in positive:\n",
    "    f=open('POS/'+file)\n",
    "    content=f.read()\n",
    "    positive_alltext.append(content)\n",
    "negative_alltext = []\n",
    "for file in negative:\n",
    "    f=open('NEG/'+file)\n",
    "    content=f.read()\n",
    "    negative_alltext.append(content)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "positive_df = pd.DataFrame(positive_alltext)\n",
    "negative_df = pd.DataFrame(negative_alltext)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "positive_df['PoN'] = 'P'\n",
    "negative_df['PoN'] = 'N'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df = positive_df.append(negative_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>films adapted from comic books have had plenty...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>you've got mail works alot better than it dese...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>\" jaws \" is a rare film that grabs your atten...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>every now and then a movie comes along from a ...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>moviemaking is a lot like being the general ma...</td>\n",
       "      <td>P</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>that's exactly how long the movie felt to me ....</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
       "      <td>N</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN\n",
       "0  films adapted from comic books have had plenty...   P\n",
       "1  you've got mail works alot better than it dese...   P\n",
       "2   \" jaws \" is a rare film that grabs your atten...   P\n",
       "3  every now and then a movie comes along from a ...   P\n",
       "4  moviemaking is a lot like being the general ma...   P\n",
       "0  that's exactly how long the movie felt to me ....   N\n",
       "1   \" quest for camelot \" is warner bros . ' firs...   N\n",
       "2  so ask yourself what \" 8mm \" ( \" eight millime...   N\n",
       "3  synopsis : a mentally unstable man undergoing ...   N\n",
       "4  capsule : in 2176 on the planet mars police ta...   N"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "def tokenizer(sentence):\n",
    "    return word_tokenize(sentence)\n",
    "\n",
    "all_df['tokenized'] = all_df.apply(lambda x: tokenizer(x[0]),axis=1)\n",
    "all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def stopword_remover(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "all_df['no_stopwords'] = all_df.apply(lambda x: stopword_remover(x['tokenized']),axis=1)\n",
    "all_df['no_stopwords_count'] = all_df.apply(lambda x: len(x['no_stopwords']),axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>tokenized</th>\n",
       "      <th>tokenized_count</th>\n",
       "      <th>no_stopwords</th>\n",
       "      <th>no_stopwords_count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>films adapted from comic books have had plenty...</td>\n",
       "      <td>P</td>\n",
       "      <td>[films, adapted, from, comic, books, have, had...</td>\n",
       "      <td>826</td>\n",
       "      <td>[films, adapted, comic, books, plenty, success...</td>\n",
       "      <td>540</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>you've got mail works alot better than it dese...</td>\n",
       "      <td>P</td>\n",
       "      <td>[you, 've, got, mail, works, alot, better, tha...</td>\n",
       "      <td>476</td>\n",
       "      <td>['ve, got, mail, works, alot, better, deserves...</td>\n",
       "      <td>267</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>\" jaws \" is a rare film that grabs your atten...</td>\n",
       "      <td>P</td>\n",
       "      <td>[``, jaws, ``, is, a, rare, film, that, grabs,...</td>\n",
       "      <td>1197</td>\n",
       "      <td>[``, jaws, ``, rare, film, grabs, attention, s...</td>\n",
       "      <td>756</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>every now and then a movie comes along from a ...</td>\n",
       "      <td>P</td>\n",
       "      <td>[every, now, and, then, a, movie, comes, along...</td>\n",
       "      <td>786</td>\n",
       "      <td>[every, movie, comes, along, suspect, studio, ...</td>\n",
       "      <td>484</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>moviemaking is a lot like being the general ma...</td>\n",
       "      <td>P</td>\n",
       "      <td>[moviemaking, is, a, lot, like, being, the, ge...</td>\n",
       "      <td>764</td>\n",
       "      <td>[moviemaking, lot, like, general, manager, nfl...</td>\n",
       "      <td>479</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>that's exactly how long the movie felt to me ....</td>\n",
       "      <td>N</td>\n",
       "      <td>[that, 's, exactly, how, long, the, movie, fel...</td>\n",
       "      <td>689</td>\n",
       "      <td>['s, exactly, long, movie, felt, ., n't, even,...</td>\n",
       "      <td>447</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>\" quest for camelot \" is warner bros . ' firs...</td>\n",
       "      <td>N</td>\n",
       "      <td>[``, quest, for, camelot, ``, is, warner, bros...</td>\n",
       "      <td>574</td>\n",
       "      <td>[``, quest, camelot, ``, warner, bros, ., ', f...</td>\n",
       "      <td>377</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>so ask yourself what \" 8mm \" ( \" eight millime...</td>\n",
       "      <td>N</td>\n",
       "      <td>[so, ask, yourself, what, ``, 8mm, ``, (, ``, ...</td>\n",
       "      <td>656</td>\n",
       "      <td>[ask, ``, 8mm, ``, (, ``, eight, millimeter, `...</td>\n",
       "      <td>412</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>synopsis : a mentally unstable man undergoing ...</td>\n",
       "      <td>N</td>\n",
       "      <td>[synopsis, :, a, mentally, unstable, man, unde...</td>\n",
       "      <td>855</td>\n",
       "      <td>[synopsis, :, mentally, unstable, man, undergo...</td>\n",
       "      <td>520</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>capsule : in 2176 on the planet mars police ta...</td>\n",
       "      <td>N</td>\n",
       "      <td>[capsule, :, in, 2176, on, the, planet, mars, ...</td>\n",
       "      <td>748</td>\n",
       "      <td>[capsule, :, 2176, planet, mars, police, takin...</td>\n",
       "      <td>454</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 PoN  \\\n",
       "0  films adapted from comic books have had plenty...   P   \n",
       "1  you've got mail works alot better than it dese...   P   \n",
       "2   \" jaws \" is a rare film that grabs your atten...   P   \n",
       "3  every now and then a movie comes along from a ...   P   \n",
       "4  moviemaking is a lot like being the general ma...   P   \n",
       "0  that's exactly how long the movie felt to me ....   N   \n",
       "1   \" quest for camelot \" is warner bros . ' firs...   N   \n",
       "2  so ask yourself what \" 8mm \" ( \" eight millime...   N   \n",
       "3  synopsis : a mentally unstable man undergoing ...   N   \n",
       "4  capsule : in 2176 on the planet mars police ta...   N   \n",
       "\n",
       "                                           tokenized  tokenized_count  \\\n",
       "0  [films, adapted, from, comic, books, have, had...              826   \n",
       "1  [you, 've, got, mail, works, alot, better, tha...              476   \n",
       "2  [``, jaws, ``, is, a, rare, film, that, grabs,...             1197   \n",
       "3  [every, now, and, then, a, movie, comes, along...              786   \n",
       "4  [moviemaking, is, a, lot, like, being, the, ge...              764   \n",
       "0  [that, 's, exactly, how, long, the, movie, fel...              689   \n",
       "1  [``, quest, for, camelot, ``, is, warner, bros...              574   \n",
       "2  [so, ask, yourself, what, ``, 8mm, ``, (, ``, ...              656   \n",
       "3  [synopsis, :, a, mentally, unstable, man, unde...              855   \n",
       "4  [capsule, :, in, 2176, on, the, planet, mars, ...              748   \n",
       "\n",
       "                                        no_stopwords  no_stopwords_count  \n",
       "0  [films, adapted, comic, books, plenty, success...                 540  \n",
       "1  ['ve, got, mail, works, alot, better, deserves...                 267  \n",
       "2  [``, jaws, ``, rare, film, grabs, attention, s...                 756  \n",
       "3  [every, movie, comes, along, suspect, studio, ...                 484  \n",
       "4  [moviemaking, lot, like, general, manager, nfl...                 479  \n",
       "0  ['s, exactly, long, movie, felt, ., n't, even,...                 447  \n",
       "1  [``, quest, camelot, ``, warner, bros, ., ', f...                 377  \n",
       "2  [ask, ``, 8mm, ``, (, ``, eight, millimeter, `...                 412  \n",
       "3  [synopsis, :, mentally, unstable, man, undergo...                 520  \n",
       "4  [capsule, :, 2176, planet, mars, police, takin...                 454  "
      ]
     },
     "execution_count": 78,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
