{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# SENTIMENT ANALYSIS (PANDAS STYLE!)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 1: Import ALL the things!\n", "#### Libraries and paths and files\n", "I'm sure there is a cleaner way to do this, plz lmk [via email](mailto:danielcaraway42@gmail.com)" ] }, { "cell_type": "code", "execution_count": 181, "metadata": {}, "outputs": [], "source": [ "import os\n", "import pandas as pd\n", "negative = os.listdir('NEG/')\n", "positive = os.listdir('POS/')" ] }, { "cell_type": "code", "execution_count": 189, "metadata": {}, "outputs": [], "source": [ "def read_texts(folder, filenames):\n", "    '''Return the full text of every file named in `filenames`, read from `folder`.\n", "\n", "    The returned list is in the same order as `filenames`.\n", "    '''\n", "    texts = []\n", "    for filename in filenames:\n", "        # `with` closes the handle even if read() raises\n", "        with open(os.path.join(folder, filename)) as f:\n", "            texts.append(f.read())\n", "    return texts\n", "\n", "positive_alltext = read_texts('POS/', positive)\n", "negative_alltext = read_texts('NEG/', negative)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 2: Turn that fresh text into a pandas DF and add a column to mark it as either positive or negative" ] }, { "cell_type": "code", "execution_count": 183, "metadata": {}, "outputs": [], "source": [ "positive_df = pd.DataFrame(positive_alltext)\n", "negative_df = pd.DataFrame(negative_alltext)" ] }, { "cell_type": "code", "execution_count": 184, "metadata": {}, "outputs": [], "source": [ "positive_df['PoN'] = 'P'\n", "negative_df['PoN'] = 'N'" ] }, { "cell_type": "code", "execution_count": 185, "metadata": {}, "outputs": [], "source": [ "# Combine the pos and neg dfs\n", "# (DataFrame.append was removed in pandas 2.0; pd.concat keeps each frame's own row labels, as append did)\n", "all_df = pd.concat([positive_df, negative_df])" ] }, { "cell_type": "code", "execution_count": 186, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0films adapted from comic books have had plenty...P
1you've got mail works alot better than it dese...P
2\" jaws \" is a rare film that grabs your atten...P
3every now and then a movie comes along from a ...P
4moviemaking is a lot like being the general ma...P
0that's exactly how long the movie felt to me ....N
1\" quest for camelot \" is warner bros . ' firs...N
2so ask yourself what \" 8mm \" ( \" eight millime...N
3synopsis : a mentally unstable man undergoing ...N
4capsule : in 2176 on the planet mars police ta...N
\n", "
" ], "text/plain": [ " 0 PoN\n", "0 films adapted from comic books have had plenty... P\n", "1 you've got mail works alot better than it dese... P\n", "2 \" jaws \" is a rare film that grabs your atten... P\n", "3 every now and then a movie comes along from a ... P\n", "4 moviemaking is a lot like being the general ma... P\n", "0 that's exactly how long the movie felt to me .... N\n", "1 \" quest for camelot \" is warner bros . ' firs... N\n", "2 so ask yourself what \" 8mm \" ( \" eight millime... N\n", "3 synopsis : a mentally unstable man undergoing ... N\n", "4 capsule : in 2176 on the planet mars police ta... N" ] }, "execution_count": 186, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Our results!\n", "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 3: TOKENIZE (and clean)!!" ] }, { "cell_type": "code", "execution_count": 187, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize\n", "\n", "def get_tokens(sentence):\n", "    '''Tokenize `sentence` and return its purely alphabetic tokens, lower-cased.\n", "\n", "    IN ENGLISH: for every word in this set of words, lower-case the word if it \"is alpha\";\n", "    \"isalpha()\" meaning \"not a number or punctuation\", so any other token is dropped.\n", "    '''\n", "    return [word.lower() for word in word_tokenize(sentence) if word.isalpha()]\n", "\n", "# Column 0 holds the raw review text; apply per column instead of row-wise with axis=1\n", "all_df['tokenized'] = all_df[0].apply(get_tokens)\n", "all_df['tokenized_count'] = all_df['tokenized'].apply(len)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 4: Remove Stopwords" ] }, { "cell_type": "code", "execution_count": 172, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "\n", "def remove_stopwords(sentence):\n", "    '''Return the words of `sentence` (a list of tokens) that are not in `stop_words`, order kept.'''\n", 
"    return [word for word in sentence if word not in stop_words]\n", "\n", "all_df['no_stopwords'] = all_df['tokenized'].apply(remove_stopwords)\n", "all_df['no_stopwords_count'] = all_df['no_stopwords'].apply(len)" ] }, { "cell_type": "code", "execution_count": 173, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNtokenizedtokenized_countno_stopwordsno_stopwords_count
0films adapted from comic books have had plenty...P[films, adapted, from, comic, books, have, had...673[films, adapted, comic, books, plenty, success...387
1you've got mail works alot better than it dese...P[you, got, mail, works, alot, better, than, it...412[got, mail, works, alot, better, deserves, ord...203
2\" jaws \" is a rare film that grabs your atten...P[jaws, is, a, rare, film, that, grabs, your, a...993[jaws, rare, film, grabs, attention, shows, si...552
3every now and then a movie comes along from a ...P[every, now, and, then, a, movie, comes, along...628[every, movie, comes, along, suspect, studio, ...326
4moviemaking is a lot like being the general ma...P[moviemaking, is, a, lot, like, being, the, ge...630[moviemaking, lot, like, general, manager, nfl...345
0that's exactly how long the movie felt to me ....N[that, exactly, how, long, the, movie, felt, t...550[exactly, long, movie, felt, even, nine, laugh...308
1\" quest for camelot \" is warner bros . ' firs...N[quest, for, camelot, is, warner, bros, first,...444[quest, camelot, warner, bros, first, attempt,...247
2so ask yourself what \" 8mm \" ( \" eight millime...N[so, ask, yourself, what, eight, millimeter, i...527[ask, eight, millimeter, really, wholesome, su...283
3synopsis : a mentally unstable man undergoing ...N[synopsis, a, mentally, unstable, man, undergo...706[synopsis, mentally, unstable, man, undergoing...371
4capsule : in 2176 on the planet mars police ta...N[capsule, in, on, the, planet, mars, police, t...649[capsule, planet, mars, police, taking, custod...355
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 films adapted from comic books have had plenty... P \n", "1 you've got mail works alot better than it dese... P \n", "2 \" jaws \" is a rare film that grabs your atten... P \n", "3 every now and then a movie comes along from a ... P \n", "4 moviemaking is a lot like being the general ma... P \n", "0 that's exactly how long the movie felt to me .... N \n", "1 \" quest for camelot \" is warner bros . ' firs... N \n", "2 so ask yourself what \" 8mm \" ( \" eight millime... N \n", "3 synopsis : a mentally unstable man undergoing ... N \n", "4 capsule : in 2176 on the planet mars police ta... N \n", "\n", " tokenized tokenized_count \\\n", "0 [films, adapted, from, comic, books, have, had... 673 \n", "1 [you, got, mail, works, alot, better, than, it... 412 \n", "2 [jaws, is, a, rare, film, that, grabs, your, a... 993 \n", "3 [every, now, and, then, a, movie, comes, along... 628 \n", "4 [moviemaking, is, a, lot, like, being, the, ge... 630 \n", "0 [that, exactly, how, long, the, movie, felt, t... 550 \n", "1 [quest, for, camelot, is, warner, bros, first,... 444 \n", "2 [so, ask, yourself, what, eight, millimeter, i... 527 \n", "3 [synopsis, a, mentally, unstable, man, undergo... 706 \n", "4 [capsule, in, on, the, planet, mars, police, t... 649 \n", "\n", " no_stopwords no_stopwords_count \n", "0 [films, adapted, comic, books, plenty, success... 387 \n", "1 [got, mail, works, alot, better, deserves, ord... 203 \n", "2 [jaws, rare, film, grabs, attention, shows, si... 552 \n", "3 [every, movie, comes, along, suspect, studio, ... 326 \n", "4 [moviemaking, lot, like, general, manager, nfl... 345 \n", "0 [exactly, long, movie, felt, even, nine, laugh... 308 \n", "1 [quest, camelot, warner, bros, first, attempt,... 247 \n", "2 [ask, eight, millimeter, really, wholesome, su... 283 \n", "3 [synopsis, mentally, unstable, man, undergoing... 371 \n", "4 [capsule, planet, mars, police, taking, custod... 
355 " ] }, "execution_count": 173, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 5: Create a Frequency Distribution" ] }, { "cell_type": "code", "execution_count": 174, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "\n", "def get_most_common(tokens, n=1):\n", "    '''Return the `n` most frequent items of `tokens` as a list of (token, count) pairs.'''\n", "    return FreqDist(tokens).most_common(n)\n", "\n", "all_df['most_common_unfiltered_word'] = all_df['tokenized'].apply(get_most_common)" ] }, { "cell_type": "code", "execution_count": 175, "metadata": {}, "outputs": [], "source": [ "# Top five words once stopwords are gone; get_most_common is defined once, in the previous cell\n", "all_df['most_common_filtered_word'] = all_df['no_stopwords'].apply(get_most_common, n=5)" ] }, { "cell_type": "code", "execution_count": 176, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNtokenizedtokenized_countno_stopwordsno_stopwords_countmost_common_unfiltered_wordmost_common_filtered_word
0films adapted from comic books have had plenty...P[films, adapted, from, comic, books, have, had...673[films, adapted, comic, books, plenty, success...387[(the, 46)][(comic, 5), (hell, 5), (film, 5), (like, 4), ...
1you've got mail works alot better than it dese...P[you, got, mail, works, alot, better, than, it...412[got, mail, works, alot, better, deserves, ord...203[(the, 33)][(two, 3), (shop, 3), (much, 3), (fox, 3), (go...
2\" jaws \" is a rare film that grabs your atten...P[jaws, is, a, rare, film, that, grabs, your, a...993[jaws, rare, film, grabs, attention, shows, si...552[(the, 63)][(shark, 16), (jaws, 8), (film, 7), (spielberg...
3every now and then a movie comes along from a ...P[every, now, and, then, a, movie, comes, along...628[every, movie, comes, along, suspect, studio, ...326[(the, 35)][(even, 6), (gets, 6), (film, 5), (school, 5),...
4moviemaking is a lot like being the general ma...P[moviemaking, is, a, lot, like, being, the, ge...630[moviemaking, lot, like, general, manager, nfl...345[(the, 41)][(jackie, 10), (like, 9), (chan, 8), (got, 4),...
0that's exactly how long the movie felt to me ....N[that, exactly, how, long, the, movie, felt, t...550[exactly, long, movie, felt, even, nine, laugh...308[(the, 31)][(grant, 12), (movie, 7), (nine, 5), (hugh, 5)...
1\" quest for camelot \" is warner bros . ' firs...N[quest, for, camelot, is, warner, bros, first,...444[quest, camelot, warner, bros, first, attempt,...247[(the, 21)][(quest, 5), (camelot, 4), (kayley, 4), (disne...
2so ask yourself what \" 8mm \" ( \" eight millime...N[so, ask, yourself, what, eight, millimeter, i...527[ask, eight, millimeter, really, wholesome, su...283[(of, 21)][(like, 4), (schumacher, 4), (film, 4), (welle...
3synopsis : a mentally unstable man undergoing ...N[synopsis, a, mentally, unstable, man, undergo...706[synopsis, mentally, unstable, man, undergoing...371[(the, 48)][(stalked, 12), (daryl, 7), (stalker, 6), (bro...
4capsule : in 2176 on the planet mars police ta...N[capsule, in, on, the, planet, mars, police, t...649[capsule, planet, mars, police, taking, custod...355[(the, 30)][(mars, 14), (ghosts, 10), (carpenter, 8), (fi...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 films adapted from comic books have had plenty... P \n", "1 you've got mail works alot better than it dese... P \n", "2 \" jaws \" is a rare film that grabs your atten... P \n", "3 every now and then a movie comes along from a ... P \n", "4 moviemaking is a lot like being the general ma... P \n", "0 that's exactly how long the movie felt to me .... N \n", "1 \" quest for camelot \" is warner bros . ' firs... N \n", "2 so ask yourself what \" 8mm \" ( \" eight millime... N \n", "3 synopsis : a mentally unstable man undergoing ... N \n", "4 capsule : in 2176 on the planet mars police ta... N \n", "\n", " tokenized tokenized_count \\\n", "0 [films, adapted, from, comic, books, have, had... 673 \n", "1 [you, got, mail, works, alot, better, than, it... 412 \n", "2 [jaws, is, a, rare, film, that, grabs, your, a... 993 \n", "3 [every, now, and, then, a, movie, comes, along... 628 \n", "4 [moviemaking, is, a, lot, like, being, the, ge... 630 \n", "0 [that, exactly, how, long, the, movie, felt, t... 550 \n", "1 [quest, for, camelot, is, warner, bros, first,... 444 \n", "2 [so, ask, yourself, what, eight, millimeter, i... 527 \n", "3 [synopsis, a, mentally, unstable, man, undergo... 706 \n", "4 [capsule, in, on, the, planet, mars, police, t... 649 \n", "\n", " no_stopwords no_stopwords_count \\\n", "0 [films, adapted, comic, books, plenty, success... 387 \n", "1 [got, mail, works, alot, better, deserves, ord... 203 \n", "2 [jaws, rare, film, grabs, attention, shows, si... 552 \n", "3 [every, movie, comes, along, suspect, studio, ... 326 \n", "4 [moviemaking, lot, like, general, manager, nfl... 345 \n", "0 [exactly, long, movie, felt, even, nine, laugh... 308 \n", "1 [quest, camelot, warner, bros, first, attempt,... 247 \n", "2 [ask, eight, millimeter, really, wholesome, su... 283 \n", "3 [synopsis, mentally, unstable, man, undergoing... 371 \n", "4 [capsule, planet, mars, police, taking, custod... 
355 \n", "\n", " most_common_unfiltered_word \\\n", "0 [(the, 46)] \n", "1 [(the, 33)] \n", "2 [(the, 63)] \n", "3 [(the, 35)] \n", "4 [(the, 41)] \n", "0 [(the, 31)] \n", "1 [(the, 21)] \n", "2 [(of, 21)] \n", "3 [(the, 48)] \n", "4 [(the, 30)] \n", "\n", " most_common_filtered_word \n", "0 [(comic, 5), (hell, 5), (film, 5), (like, 4), ... \n", "1 [(two, 3), (shop, 3), (much, 3), (fox, 3), (go... \n", "2 [(shark, 16), (jaws, 8), (film, 7), (spielberg... \n", "3 [(even, 6), (gets, 6), (film, 5), (school, 5),... \n", "4 [(jackie, 10), (like, 9), (chan, 8), (got, 4),... \n", "0 [(grant, 12), (movie, 7), (nine, 5), (hugh, 5)... \n", "1 [(quest, 5), (camelot, 4), (kayley, 4), (disne... \n", "2 [(like, 4), (schumacher, 4), (film, 4), (welle... \n", "3 [(stalked, 12), (daryl, 7), (stalker, 6), (bro... \n", "4 [(mars, 14), (ghosts, 10), (carpenter, 8), (fi... " ] }, "execution_count": 176, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }