{ "cells": [ { "cell_type": "code", "execution_count": 69, "metadata": {}, "outputs": [], "source": [ "import os\n", "negative = os.listdir('NEG/')\n", "positive = os.listdir('POS/')" ] }, { "cell_type": "code", "execution_count": 70, "metadata": {}, "outputs": [], "source": [ "positive_alltext = []\n", "for file in positive:\n", " f=open('POS/'+file)\n", " content=f.read()\n", " positive_alltext.append(content)\n", "negative_alltext = []\n", "for file in negative:\n", " f=open('NEG/'+file)\n", " content=f.read()\n", " negative_alltext.append(content)" ] }, { "cell_type": "code", "execution_count": 71, "metadata": {}, "outputs": [], "source": [ "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [ "positive_df = pd.DataFrame(positive_alltext)\n", "negative_df = pd.DataFrame(negative_alltext)" ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [], "source": [ "positive_df['PoN'] = 'P'\n", "negative_df['PoN'] = 'N'" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [ "all_df = positive_df.append(negative_df)" ] }, { "cell_type": "code", "execution_count": 75, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0films adapted from comic books have had plenty...P
1you've got mail works alot better than it dese...P
2\" jaws \" is a rare film that grabs your atten...P
3every now and then a movie comes along from a ...P
4moviemaking is a lot like being the general ma...P
0that's exactly how long the movie felt to me ....N
1\" quest for camelot \" is warner bros . ' firs...N
2so ask yourself what \" 8mm \" ( \" eight millime...N
3synopsis : a mentally unstable man undergoing ...N
4capsule : in 2176 on the planet mars police ta...N
\n", "
" ], "text/plain": [ " 0 PoN\n", "0 films adapted from comic books have had plenty... P\n", "1 you've got mail works alot better than it dese... P\n", "2 \" jaws \" is a rare film that grabs your atten... P\n", "3 every now and then a movie comes along from a ... P\n", "4 moviemaking is a lot like being the general ma... P\n", "0 that's exactly how long the movie felt to me .... N\n", "1 \" quest for camelot \" is warner bros . ' firs... N\n", "2 so ask yourself what \" 8mm \" ( \" eight millime... N\n", "3 synopsis : a mentally unstable man undergoing ... N\n", "4 capsule : in 2176 on the planet mars police ta... N" ] }, "execution_count": 75, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize\n", "def tokenizer(sentence):\n", " return word_tokenize(sentence)\n", "\n", "all_df['tokenized'] = all_df.apply(lambda x: tokenizer(x[0]),axis=1)\n", "all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)" ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def stopword_remover(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_stopwords'] = all_df.apply(lambda x: stopword_remover(x['tokenized']),axis=1)\n", "all_df['no_stopwords_count'] = all_df.apply(lambda x: len(x['no_stopwords']),axis=1)" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNtokenizedtokenized_countno_stopwordsno_stopwords_count
0films adapted from comic books have had plenty...P[films, adapted, from, comic, books, have, had...826[films, adapted, comic, books, plenty, success...540
1you've got mail works alot better than it dese...P[you, 've, got, mail, works, alot, better, tha...476['ve, got, mail, works, alot, better, deserves...267
2\" jaws \" is a rare film that grabs your atten...P[``, jaws, ``, is, a, rare, film, that, grabs,...1197[``, jaws, ``, rare, film, grabs, attention, s...756
3every now and then a movie comes along from a ...P[every, now, and, then, a, movie, comes, along...786[every, movie, comes, along, suspect, studio, ...484
4moviemaking is a lot like being the general ma...P[moviemaking, is, a, lot, like, being, the, ge...764[moviemaking, lot, like, general, manager, nfl...479
0that's exactly how long the movie felt to me ....N[that, 's, exactly, how, long, the, movie, fel...689['s, exactly, long, movie, felt, ., n't, even,...447
1\" quest for camelot \" is warner bros . ' firs...N[``, quest, for, camelot, ``, is, warner, bros...574[``, quest, camelot, ``, warner, bros, ., ', f...377
2so ask yourself what \" 8mm \" ( \" eight millime...N[so, ask, yourself, what, ``, 8mm, ``, (, ``, ...656[ask, ``, 8mm, ``, (, ``, eight, millimeter, `...412
3synopsis : a mentally unstable man undergoing ...N[synopsis, :, a, mentally, unstable, man, unde...855[synopsis, :, mentally, unstable, man, undergo...520
4capsule : in 2176 on the planet mars police ta...N[capsule, :, in, 2176, on, the, planet, mars, ...748[capsule, :, 2176, planet, mars, police, takin...454
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 films adapted from comic books have had plenty... P \n", "1 you've got mail works alot better than it dese... P \n", "2 \" jaws \" is a rare film that grabs your atten... P \n", "3 every now and then a movie comes along from a ... P \n", "4 moviemaking is a lot like being the general ma... P \n", "0 that's exactly how long the movie felt to me .... N \n", "1 \" quest for camelot \" is warner bros . ' firs... N \n", "2 so ask yourself what \" 8mm \" ( \" eight millime... N \n", "3 synopsis : a mentally unstable man undergoing ... N \n", "4 capsule : in 2176 on the planet mars police ta... N \n", "\n", " tokenized tokenized_count \\\n", "0 [films, adapted, from, comic, books, have, had... 826 \n", "1 [you, 've, got, mail, works, alot, better, tha... 476 \n", "2 [``, jaws, ``, is, a, rare, film, that, grabs,... 1197 \n", "3 [every, now, and, then, a, movie, comes, along... 786 \n", "4 [moviemaking, is, a, lot, like, being, the, ge... 764 \n", "0 [that, 's, exactly, how, long, the, movie, fel... 689 \n", "1 [``, quest, for, camelot, ``, is, warner, bros... 574 \n", "2 [so, ask, yourself, what, ``, 8mm, ``, (, ``, ... 656 \n", "3 [synopsis, :, a, mentally, unstable, man, unde... 855 \n", "4 [capsule, :, in, 2176, on, the, planet, mars, ... 748 \n", "\n", " no_stopwords no_stopwords_count \n", "0 [films, adapted, comic, books, plenty, success... 540 \n", "1 ['ve, got, mail, works, alot, better, deserves... 267 \n", "2 [``, jaws, ``, rare, film, grabs, attention, s... 756 \n", "3 [every, movie, comes, along, suspect, studio, ... 484 \n", "4 [moviemaking, lot, like, general, manager, nfl... 479 \n", "0 ['s, exactly, long, movie, felt, ., n't, even,... 447 \n", "1 [``, quest, camelot, ``, warner, bros, ., ', f... 377 \n", "2 [ask, ``, 8mm, ``, (, ``, eight, millimeter, `... 412 \n", "3 [synopsis, :, mentally, unstable, man, undergo... 520 \n", "4 [capsule, :, 2176, planet, mars, police, takin... 454 " ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }