{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW2: VECTORIZATION (Pandas style!)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: Import ALL the things\n", "### Import libraries " ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "##########################################\n", "# NOTE: I'm toying with the idea of requiring the library just above \n", "# when I use it so it makes more sense in context\n", "##########################################\n", "# import os\n", "# import pandas as pd\n", "# from nltk.tokenize import word_tokenize, sent_tokenize\n", "# from nltk.sentiment import SentimentAnalyzer\n", "# from nltk.sentiment.util import *\n", "# from nltk.probability import FreqDist\n", "# from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "# sid = SentimentIntensityAnalyzer()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Import data from files" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "def get_data_from_files(path):\n", " directory = os.listdir(path)\n", " results = []\n", " for file in directory:\n", " f=open(path+file)\n", " results.append(f.read())\n", " f.close()\n", " return results\n", "\n", "neg = get_data_from_files('../neg_cornell/')\n", "pos = get_data_from_files('../pos_cornell/')\n", "\n", "# neg = get_data_from_files('../neg_hw4/')\n", "# pos = get_data_from_files('../pos_hw4/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: Prep Data\n", "### STEP 2a: Turn that fresh text into a pandas DF" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "neg_df = pd.DataFrame(neg)\n", "pos_df = pd.DataFrame(pos)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 2b: Label it" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "pos_df['PoN'] = 'P'\n", "neg_df['PoN'] = 'N'" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### STEP 2c: Combine the dfs" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "all_df = neg_df.append(pos_df)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0bad . bad . \\nbad . \\nthat one word seems to p...N
1isn't it the ultimate sign of a movie's cinema...N
2\" gordy \" is not a movie , it is a 90-minute-...N
3disconnect the phone line . \\ndon't accept the...N
4when robert forster found himself famous again...N
.........
995one of the funniest carry on movies and the th...P
996i remember making a pact , right after `patch ...P
997barely scrapping by playing at a nyc piano bar...P
998if the current trends of hollywood filmmaking ...P
999capsule : the director of cure brings a weird ...P
\n", "

2000 rows × 2 columns

\n", "
" ], "text/plain": [ " 0 PoN\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N\n", "1 isn't it the ultimate sign of a movie's cinema... N\n", "2 \" gordy \" is not a movie , it is a 90-minute-... N\n", "3 disconnect the phone line . \\ndon't accept the... N\n", "4 when robert forster found himself famous again... N\n", ".. ... ..\n", "995 one of the funniest carry on movies and the th... P\n", "996 i remember making a pact , right after `patch ... P\n", "997 barely scrapping by playing at a nyc piano bar... P\n", "998 if the current trends of hollywood filmmaking ... P\n", "999 capsule : the director of cure brings a weird ... P\n", "\n", "[2000 rows x 2 columns]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: TOKENIZE (and clean)!!" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "## Came back and added sentences for tokinization for \"Summary experiment\"\n", "def get_sentence_tokens(review):\n", " return sent_tokenize(review)\n", " \n", "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n", "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n", "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokens
0bad . bad . \\nbad . \\nthat one word seems to p...N[bad ., bad ., bad ., that one word seems to p...67[bad, bad, bad, that, one, word, seems, to, pr...1071
1isn't it the ultimate sign of a movie's cinema...N[isn't it the ultimate sign of a movie's cinem...32[is, it, the, ultimate, sign, of, a, movie, ci...553
2\" gordy \" is not a movie , it is a 90-minute-...N[ \" gordy \" is not a movie , it is a 90-minute...23[gordy, is, not, a, movie, it, is, a, sesame, ...478
3disconnect the phone line . \\ndon't accept the...N[disconnect the phone line ., don't accept the...37[disconnect, the, phone, line, do, accept, the...604
4when robert forster found himself famous again...N[when robert forster found himself famous agai...29[when, robert, forster, found, himself, famous...386
.....................
995one of the funniest carry on movies and the th...P[one of the funniest carry on movies and the t...25[one, of, the, funniest, carry, on, movies, an...434
996i remember making a pact , right after `patch ...P[i remember making a pact , right after `patch...40[i, remember, making, a, pact, right, after, p...652
997barely scrapping by playing at a nyc piano bar...P[barely scrapping by playing at a nyc piano ba...23[barely, scrapping, by, playing, at, a, nyc, p...345
998if the current trends of hollywood filmmaking ...P[if the current trends of hollywood filmmaking...34[if, the, current, trends, of, hollywood, film...730
999capsule : the director of cure brings a weird ...P[capsule : the director of cure brings a weird...45[capsule, the, director, of, cure, brings, a, ...641
\n", "

2000 rows × 6 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N \n", "1 isn't it the ultimate sign of a movie's cinema... N \n", "2 \" gordy \" is not a movie , it is a 90-minute-... N \n", "3 disconnect the phone line . \\ndon't accept the... N \n", "4 when robert forster found himself famous again... N \n", ".. ... .. \n", "995 one of the funniest carry on movies and the th... P \n", "996 i remember making a pact , right after `patch ... P \n", "997 barely scrapping by playing at a nyc piano bar... P \n", "998 if the current trends of hollywood filmmaking ... P \n", "999 capsule : the director of cure brings a weird ... P \n", "\n", " sentences num_sentences \\\n", "0 [bad ., bad ., bad ., that one word seems to p... 67 \n", "1 [isn't it the ultimate sign of a movie's cinem... 32 \n", "2 [ \" gordy \" is not a movie , it is a 90-minute... 23 \n", "3 [disconnect the phone line ., don't accept the... 37 \n", "4 [when robert forster found himself famous agai... 29 \n", ".. ... ... \n", "995 [one of the funniest carry on movies and the t... 25 \n", "996 [i remember making a pact , right after `patch... 40 \n", "997 [barely scrapping by playing at a nyc piano ba... 23 \n", "998 [if the current trends of hollywood filmmaking... 34 \n", "999 [capsule : the director of cure brings a weird... 45 \n", "\n", " tokens num_tokens \n", "0 [bad, bad, bad, that, one, word, seems, to, pr... 1071 \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... 553 \n", "2 [gordy, is, not, a, movie, it, is, a, sesame, ... 478 \n", "3 [disconnect, the, phone, line, do, accept, the... 604 \n", "4 [when, robert, forster, found, himself, famous... 386 \n", ".. ... ... \n", "995 [one, of, the, funniest, carry, on, movies, an... 434 \n", "996 [i, remember, making, a, pact, right, after, p... 652 \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... 345 \n", "998 [if, the, current, trends, of, hollywood, film... 730 \n", "999 [capsule, the, director, of, cure, brings, a, ... 641 \n", "\n", "[2000 rows x 6 columns]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: Remove Stopwords" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def remove_stopwords(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n", "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_sw
0bad . bad . \\nbad . \\nthat one word seems to p...N[bad ., bad ., bad ., that one word seems to p...67[bad, bad, bad, that, one, word, seems, to, pr...1071[bad, bad, bad, one, word, seems, pretty, much...515
1isn't it the ultimate sign of a movie's cinema...N[isn't it the ultimate sign of a movie's cinem...32[is, it, the, ultimate, sign, of, a, movie, ci...553[ultimate, sign, movie, cinematic, ineptitude,...297
2\" gordy \" is not a movie , it is a 90-minute-...N[ \" gordy \" is not a movie , it is a 90-minute...23[gordy, is, not, a, movie, it, is, a, sesame, ...478[gordy, movie, sesame, street, skit, bad, one,...239
3disconnect the phone line . \\ndon't accept the...N[disconnect the phone line ., don't accept the...37[disconnect, the, phone, line, do, accept, the...604[disconnect, phone, line, accept, charges, any...323
4when robert forster found himself famous again...N[when robert forster found himself famous agai...29[when, robert, forster, found, himself, famous...386[robert, forster, found, famous, appearing, ja...185
...........................
995one of the funniest carry on movies and the th...P[one of the funniest carry on movies and the t...25[one, of, the, funniest, carry, on, movies, an...434[one, funniest, carry, movies, third, medical,...241
996i remember making a pact , right after `patch ...P[i remember making a pact , right after `patch...40[i, remember, making, a, pact, right, after, p...652[remember, making, pact, right, patch, adams, ...361
997barely scrapping by playing at a nyc piano bar...P[barely scrapping by playing at a nyc piano ba...23[barely, scrapping, by, playing, at, a, nyc, p...345[barely, scrapping, playing, nyc, piano, bar, ...177
998if the current trends of hollywood filmmaking ...P[if the current trends of hollywood filmmaking...34[if, the, current, trends, of, hollywood, film...730[current, trends, hollywood, filmmaking, conti...428
999capsule : the director of cure brings a weird ...P[capsule : the director of cure brings a weird...45[capsule, the, director, of, cure, brings, a, ...641[capsule, director, cure, brings, weird, compl...340
\n", "

2000 rows × 8 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N \n", "1 isn't it the ultimate sign of a movie's cinema... N \n", "2 \" gordy \" is not a movie , it is a 90-minute-... N \n", "3 disconnect the phone line . \\ndon't accept the... N \n", "4 when robert forster found himself famous again... N \n", ".. ... .. \n", "995 one of the funniest carry on movies and the th... P \n", "996 i remember making a pact , right after `patch ... P \n", "997 barely scrapping by playing at a nyc piano bar... P \n", "998 if the current trends of hollywood filmmaking ... P \n", "999 capsule : the director of cure brings a weird ... P \n", "\n", " sentences num_sentences \\\n", "0 [bad ., bad ., bad ., that one word seems to p... 67 \n", "1 [isn't it the ultimate sign of a movie's cinem... 32 \n", "2 [ \" gordy \" is not a movie , it is a 90-minute... 23 \n", "3 [disconnect the phone line ., don't accept the... 37 \n", "4 [when robert forster found himself famous agai... 29 \n", ".. ... ... \n", "995 [one of the funniest carry on movies and the t... 25 \n", "996 [i remember making a pact , right after `patch... 40 \n", "997 [barely scrapping by playing at a nyc piano ba... 23 \n", "998 [if the current trends of hollywood filmmaking... 34 \n", "999 [capsule : the director of cure brings a weird... 45 \n", "\n", " tokens num_tokens \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... 1071 \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... 553 \n", "2 [gordy, is, not, a, movie, it, is, a, sesame, ... 478 \n", "3 [disconnect, the, phone, line, do, accept, the... 604 \n", "4 [when, robert, forster, found, himself, famous... 386 \n", ".. ... ... \n", "995 [one, of, the, funniest, carry, on, movies, an... 434 \n", "996 [i, remember, making, a, pact, right, after, p... 652 \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... 345 \n", "998 [if, the, current, trends, of, hollywood, film... 730 \n", "999 [capsule, the, director, of, cure, brings, a, ... 641 \n", "\n", " no_sw num_no_sw \n", "0 [bad, bad, bad, one, word, seems, pretty, much... 515 \n", "1 [ultimate, sign, movie, cinematic, ineptitude,... 297 \n", "2 [gordy, movie, sesame, street, skit, bad, one,... 239 \n", "3 [disconnect, phone, line, accept, charges, any... 323 \n", "4 [robert, forster, found, famous, appearing, ja... 185 \n", ".. ... ... \n", "995 [one, funniest, carry, movies, third, medical,... 241 \n", "996 [remember, making, pact, right, patch, adams, ... 361 \n", "997 [barely, scrapping, playing, nyc, piano, bar, ... 177 \n", "998 [current, trends, hollywood, filmmaking, conti... 428 \n", "999 [capsule, director, cure, brings, weird, compl... 340 \n", "\n", "[2000 rows x 8 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 5: Create a Frequency Distribution" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "def get_most_common(tokens):\n", " fdist = FreqDist(tokens)\n", " return fdist.most_common(12)\n", "all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": { "scrolled": true }, "outputs": [], "source": [ "def get_most_common(tokens):\n", " fdist = FreqDist(tokens)\n", " return fdist.most_common(12)\n", "all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "def get_fdist(tokens):\n", " return (FreqDist(tokens))\n", " \n", "all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)\n", "all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swtopwords_unfiltopwords_filfreq_distfreq_dist_unfil
0bad . bad . \\nbad . \\nthat one word seems to p...N[bad ., bad ., bad ., that one word seems to p...67[bad, bad, bad, that, one, word, seems, to, pr...1071[bad, bad, bad, one, word, seems, pretty, much...515[(the, 60), (a, 35), (to, 34), (of, 24), (this...[(movie, 17), (bad, 8), (one, 7), (meyer, 6), ...{'bad': 8, 'one': 7, 'word': 1, 'seems': 1, 'p...{'bad': 8, 'that': 19, 'one': 7, 'word': 1, 's...
1isn't it the ultimate sign of a movie's cinema...N[isn't it the ultimate sign of a movie's cinem...32[is, it, the, ultimate, sign, of, a, movie, ci...553[ultimate, sign, movie, cinematic, ineptitude,...297[(the, 28), (a, 18), (of, 16), (to, 14), (i, 1...[(movie, 7), (one, 6), (first, 5), (much, 4), ...{'ultimate': 1, 'sign': 1, 'movie': 7, 'cinema...{'is': 11, 'it': 11, 'the': 28, 'ultimate': 1,...
2\" gordy \" is not a movie , it is a 90-minute-...N[ \" gordy \" is not a movie , it is a 90-minute...23[gordy, is, not, a, movie, it, is, a, sesame, ...478[gordy, movie, sesame, street, skit, bad, one,...239[(the, 25), (and, 21), (to, 18), (is, 17), (a,...[(gordy, 8), (movie, 5), (one, 4), (stupid, 4)...{'gordy': 8, 'movie': 5, 'sesame': 1, 'street'...{'gordy': 8, 'is': 17, 'not': 3, 'a': 17, 'mov...
3disconnect the phone line . \\ndon't accept the...N[disconnect the phone line ., don't accept the...37[disconnect, the, phone, line, do, accept, the...604[disconnect, phone, line, accept, charges, any...323[(the, 41), (of, 17), (a, 17), (to, 16), (and,...[(hanging, 9), (sisters, 5), (ryan, 4), (time,...{'disconnect': 1, 'phone': 2, 'line': 1, 'acce...{'disconnect': 1, 'the': 41, 'phone': 2, 'line...
4when robert forster found himself famous again...N[when robert forster found himself famous agai...29[when, robert, forster, found, himself, famous...386[robert, forster, found, famous, appearing, ja...185[(the, 21), (it, 11), (i, 10), (to, 10), (of, ...[(film, 5), (movie, 5), (american, 4), (perfek...{'robert': 2, 'forster': 3, 'found': 1, 'famou...{'when': 2, 'robert': 2, 'forster': 3, 'found'...
.......................................
995one of the funniest carry on movies and the th...P[one of the funniest carry on movies and the t...25[one, of, the, funniest, carry, on, movies, an...434[one, funniest, carry, movies, third, medical,...241[(the, 26), (and, 21), (of, 11), (a, 10), (is,...[(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4...{'one': 1, 'funniest': 1, 'carry': 4, 'movies'...{'one': 1, 'of': 11, 'the': 26, 'funniest': 1,...
996i remember making a pact , right after `patch ...P[i remember making a pact , right after `patch...40[i, remember, making, a, pact, right, after, p...652[remember, making, pact, right, patch, adams, ...361[(the, 44), (of, 29), (and, 19), (a, 15), (it,...[(music, 8), (heart, 7), (craven, 6), (movie, ...{'remember': 1, 'making': 1, 'pact': 1, 'right...{'i': 1, 'remember': 1, 'making': 1, 'a': 15, ...
997barely scrapping by playing at a nyc piano bar...P[barely scrapping by playing at a nyc piano ba...23[barely, scrapping, by, playing, at, a, nyc, p...345[barely, scrapping, playing, nyc, piano, bar, ...177[(a, 23), (is, 16), (the, 13), (and, 10), (of,...[(like, 4), (hutton, 3), (old, 3), (high, 2), ...{'barely': 1, 'scrapping': 1, 'playing': 1, 'n...{'barely': 1, 'scrapping': 1, 'by': 2, 'playin...
998if the current trends of hollywood filmmaking ...P[if the current trends of hollywood filmmaking...34[if, the, current, trends, of, hollywood, film...730[current, trends, hollywood, filmmaking, conti...428[(the, 49), (of, 31), (and, 19), (in, 18), (to...[(one, 7), (like, 5), (l, 5), (hollywood, 4), ...{'current': 1, 'trends': 1, 'hollywood': 4, 'f...{'if': 1, 'the': 49, 'current': 1, 'trends': 1...
999capsule : the director of cure brings a weird ...P[capsule : the director of cure brings a weird...45[capsule, the, director, of, cure, brings, a, ...641[capsule, director, cure, brings, weird, compl...340[(the, 33), (to, 28), (and, 21), (a, 18), (of,...[(computer, 11), (kurosawa, 8), (one, 5), (see...{'capsule': 1, 'director': 1, 'cure': 3, 'brin...{'capsule': 1, 'the': 33, 'director': 1, 'of':...
\n", "

2000 rows × 12 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N \n", "1 isn't it the ultimate sign of a movie's cinema... N \n", "2 \" gordy \" is not a movie , it is a 90-minute-... N \n", "3 disconnect the phone line . \\ndon't accept the... N \n", "4 when robert forster found himself famous again... N \n", ".. ... .. \n", "995 one of the funniest carry on movies and the th... P \n", "996 i remember making a pact , right after `patch ... P \n", "997 barely scrapping by playing at a nyc piano bar... P \n", "998 if the current trends of hollywood filmmaking ... P \n", "999 capsule : the director of cure brings a weird ... P \n", "\n", " sentences num_sentences \\\n", "0 [bad ., bad ., bad ., that one word seems to p... 67 \n", "1 [isn't it the ultimate sign of a movie's cinem... 32 \n", "2 [ \" gordy \" is not a movie , it is a 90-minute... 23 \n", "3 [disconnect the phone line ., don't accept the... 37 \n", "4 [when robert forster found himself famous agai... 29 \n", ".. ... ... \n", "995 [one of the funniest carry on movies and the t... 25 \n", "996 [i remember making a pact , right after `patch... 40 \n", "997 [barely scrapping by playing at a nyc piano ba... 23 \n", "998 [if the current trends of hollywood filmmaking... 34 \n", "999 [capsule : the director of cure brings a weird... 45 \n", "\n", " tokens num_tokens \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... 1071 \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... 553 \n", "2 [gordy, is, not, a, movie, it, is, a, sesame, ... 478 \n", "3 [disconnect, the, phone, line, do, accept, the... 604 \n", "4 [when, robert, forster, found, himself, famous... 386 \n", ".. ... ... \n", "995 [one, of, the, funniest, carry, on, movies, an... 434 \n", "996 [i, remember, making, a, pact, right, after, p... 652 \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... 345 \n", "998 [if, the, current, trends, of, hollywood, film... 730 \n", "999 [capsule, the, director, of, cure, brings, a, ... 641 \n", "\n", " no_sw num_no_sw \\\n", "0 [bad, bad, bad, one, word, seems, pretty, much... 515 \n", "1 [ultimate, sign, movie, cinematic, ineptitude,... 297 \n", "2 [gordy, movie, sesame, street, skit, bad, one,... 239 \n", "3 [disconnect, phone, line, accept, charges, any... 323 \n", "4 [robert, forster, found, famous, appearing, ja... 185 \n", ".. ... ... \n", "995 [one, funniest, carry, movies, third, medical,... 241 \n", "996 [remember, making, pact, right, patch, adams, ... 361 \n", "997 [barely, scrapping, playing, nyc, piano, bar, ... 177 \n", "998 [current, trends, hollywood, filmmaking, conti... 428 \n", "999 [capsule, director, cure, brings, weird, compl... 340 \n", "\n", " topwords_unfil \\\n", "0 [(the, 60), (a, 35), (to, 34), (of, 24), (this... \n", "1 [(the, 28), (a, 18), (of, 16), (to, 14), (i, 1... \n", "2 [(the, 25), (and, 21), (to, 18), (is, 17), (a,... \n", "3 [(the, 41), (of, 17), (a, 17), (to, 16), (and,... \n", "4 [(the, 21), (it, 11), (i, 10), (to, 10), (of, ... \n", ".. ... \n", "995 [(the, 26), (and, 21), (of, 11), (a, 10), (is,... \n", "996 [(the, 44), (of, 29), (and, 19), (a, 15), (it,... \n", "997 [(a, 23), (is, 16), (the, 13), (and, 10), (of,... \n", "998 [(the, 49), (of, 31), (and, 19), (in, 18), (to... \n", "999 [(the, 33), (to, 28), (and, 21), (a, 18), (of,... \n", "\n", " topwords_fil \\\n", "0 [(movie, 17), (bad, 8), (one, 7), (meyer, 6), ... \n", "1 [(movie, 7), (one, 6), (first, 5), (much, 4), ... \n", "2 [(gordy, 8), (movie, 5), (one, 4), (stupid, 4)... \n", "3 [(hanging, 9), (sisters, 5), (ryan, 4), (time,... \n", "4 [(film, 5), (movie, 5), (american, 4), (perfek... \n", ".. ... \n", "995 [(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4... \n", "996 [(music, 8), (heart, 7), (craven, 6), (movie, ... \n", "997 [(like, 4), (hutton, 3), (old, 3), (high, 2), ... \n", "998 [(one, 7), (like, 5), (l, 5), (hollywood, 4), ... \n", "999 [(computer, 11), (kurosawa, 8), (one, 5), (see... \n", "\n", " freq_dist \\\n", "0 {'bad': 8, 'one': 7, 'word': 1, 'seems': 1, 'p... \n", "1 {'ultimate': 1, 'sign': 1, 'movie': 7, 'cinema... \n", "2 {'gordy': 8, 'movie': 5, 'sesame': 1, 'street'... \n", "3 {'disconnect': 1, 'phone': 2, 'line': 1, 'acce... \n", "4 {'robert': 2, 'forster': 3, 'found': 1, 'famou... \n", ".. ... \n", "995 {'one': 1, 'funniest': 1, 'carry': 4, 'movies'... \n", "996 {'remember': 1, 'making': 1, 'pact': 1, 'right... \n", "997 {'barely': 1, 'scrapping': 1, 'playing': 1, 'n... \n", "998 {'current': 1, 'trends': 1, 'hollywood': 4, 'f... \n", "999 {'capsule': 1, 'director': 1, 'cure': 3, 'brin... \n", "\n", " freq_dist_unfil \n", "0 {'bad': 8, 'that': 19, 'one': 7, 'word': 1, 's... \n", "1 {'is': 11, 'it': 11, 'the': 28, 'ultimate': 1,... \n", "2 {'gordy': 8, 'is': 17, 'not': 3, 'a': 17, 'mov... \n", "3 {'disconnect': 1, 'the': 41, 'phone': 2, 'line... \n", "4 {'when': 2, 'robert': 2, 'forster': 3, 'found'... \n", ".. ... \n", "995 {'one': 1, 'of': 11, 'the': 26, 'funniest': 1,... \n", "996 {'i': 1, 'remember': 1, 'making': 1, 'a': 15, ... \n", "997 {'barely': 1, 'scrapping': 1, 'by': 2, 'playin... \n", "998 {'if': 1, 'the': 49, 'current': 1, 'trends': 1... \n", "999 {'capsule': 1, 'the': 33, 'director': 1, 'of':... \n", "\n", "[2000 rows x 12 columns]" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 6: Try Different Sentiment Analysis Tools" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### VADER" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "sid = SentimentIntensityAnalyzer()\n", "def get_vader_score(review):\n", " return sid.polarity_scores(review)\n", "\n", "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def separate_vader_score(vader_score, key):\n", " return vader_score[key]\n", "\n", "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n", "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n", "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n", "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### DIY SUMMARY" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "17 about an hour or so into \" the jackal , \" a ch...\n", "17 meet joe black ( reviewed on nov . 27/98 ) \\ns...\n", "Name: 0, dtype: object" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[0][17]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "def get_weighted_freq_dist(review, freq_dist):\n", " try:\n", " max_freq = max(freq_dist.values())\n", " for word in freq_dist.keys():\n", " freq_dist[word] = (freq_dist[word]/max_freq)\n", " return freq_dist\n", " except:\n", " return 'nope'\n", "\n", "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "def get_sentence_score(review, freq_dist):\n", " sentence_scores = {}\n", " for sent in review:\n", " for word in nltk.word_tokenize(sent.lower()):\n", " if word in freq_dist.keys():\n", " if len(sent.split(' ')) < 30:\n", " if sent not in sentence_scores.keys():\n", " sentence_scores[sent] = freq_dist[word]\n", " else:\n", " sentence_scores[sent] += freq_dist[word]\n", " return sentence_scores\n", "\n", "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def get_summary_sentences(sentence_scores):\n", " sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n", " return ''.join(sent[0] for sent in sorted_sentences[:5])\n", "\n", "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "summaries = all_df['summary_sentences'].tolist()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"cell-phones ring every five minutes , and everyone hurriedly rushes along , leaving marginal time for the frustrated viewer to relate to the sisters' issues and problems .i figured i needed to get in touch with my feminine side , and `hanging up' seemed like an ideal opportunity to do so .ryan's convincing performance and diverting cuteness are two of the more agreeable aspects of `hanging up' .it's certainly a far cry from what one would label as a rewarding experience , but `hanging up' should have at least been enjoyable .maddy ( kudrow ) , the soap opera actress , spends time either contemplating her possible path to stardom or nursing her dog .\"" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summaries[3]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Doing VADER on the Summary Section" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n", "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n", "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n", "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Doing VADER on the Most Frequent Words" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "def get_freq_words(freq_dist):\n", " sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n", " return ' '.join(word[0] for word in sorted_words[:50])\n", "\n", "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n", "\n", "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n", "all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n", "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n", "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n", "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 7: Test `Step 6` with Machine Learning!!\n", "### Naive Bayes" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "\n", "def get_NB(small_df, labels):\n", " x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n", "\n", " gnb = GaussianNB()\n", " gnb.fit(x_train, y_train)\n", " y_pred = gnb.predict(x_test)\n", " from sklearn import metrics\n", " print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 1: Vader Scores (Original)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.645\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu']) # 0.645\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 2: Vader Scores (from Summary)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.59\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) # 0.59\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 3: Vader Scores (original) AND Vader Scores (summary)" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6183333333333333\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 4: Vader Scores (50 most frequent -- filtered -- words)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.5983333333333334\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd']) # 0.598\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 5: All `compound` Vader Scores" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.615\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd','v_compound_sum', 'v_compound']) # 0.615\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 6: ALL THE NUMBERS!!" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6133333333333333\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.613\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### TEST 7: Test UNFILTERED most frequent words" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "def get_freq_words(freq_dist):\n", " sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n", " return ' '.join(word[0] for word in sorted_words[:50])\n", "\n", "all_df['v_freq_words_unfil'] = all_df.apply(lambda x: get_freq_words(x['freq_dist_unfil']), axis=1)\n", "\n", "all_df['vader_fd_all_unfil'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words_unfil']),axis=1)\n", "\n", "all_df['v_compound_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'compound'),axis=1)\n", "all_df['v_neg_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neg'),axis=1)\n", "all_df['v_neu_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'neu'),axis=1)\n", "all_df['v_pos_fd_uf'] = all_df.apply(lambda x: separate_vader_score(x['vader_fd_all_unfil'], 'pos'),axis=1)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.62\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n", " 'v_compound_fd','v_pos_fd', 'v_neg_fd', 'v_neu_fd', \n", " 'v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf',\n", " 'v_compound','v_pos', 'v_neg', 'v_neu']) # 0.618\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6033333333333334\n" ] } ], "source": [ "small_df = all_df.filter(['v_compound_fd_uf','v_pos_fd_uf', 'v_neg_fd_uf', 'v_neu_fd_uf']) # 0.603\n", "get_NB(small_df, all_df['PoN'])" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [], "source": [ "summaries_pos = all_df[all_df['PoN'] == 'P']\n", "summaries_neg = all_df[all_df['PoN'] == 'N']" ] }, { "cell_type": "code", "execution_count": 38, "metadata": {}, "outputs": [], "source": [ "summaries_pos_list = summaries_pos['summary_sentences'].tolist()\n", "summaries_neg_list = summaries_neg['summary_sentences'].tolist()" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['charles walks in on amy and oscar having a drink one night , as oscar and amy have become great friends , but he doesn\\'t seem to mind .neve is delightful as her conflicted character , who feels love for oscar , but knows , based on rumors , that he is gay .the bottom line : three to tango is a light , sharp , snappy romantic comedy with a superb ending , and great stars .well , another popular phrase of the 90\\'s is \" all good things must come to an end , \" and this stays true for oscar as well .oscar gladly takes the job , and meets amy at an art show of hers , and sparks fly between the two from the get go .']" ] }, "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summaries_pos_list[:1]" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"but the wretched dialogue goes along well with the wretched quality of everything else in this movie .i don't know , but all the big words in the world wouldn't be able to disguise the bad writing and even worse acting .hey , it a sexist movie , so i'm writing a sexist review .this goes along with the rest of the idiotic thinking in the movie .there are a couple of other idiotic subplots thrown in for good measure , but the fame is the one that pretty much sums up this thing .\"]" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summaries_neg_list[:1]" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"but the wretched dialogue goes along well with the wretched quality of everything else in this movie .i don't know , but all the big words in the world wouldn't be able to disguise the bad writing and even worse acting .hey , it a sexist movie , so i'm writing a sexist review .this goes along with the rest of the idiotic thinking in the movie .there are a couple of other idiotic subplots thrown in for good measure , but the fame is the one that pretty much sums up this thing .\"]" ] }, "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ "summaries_neg_list[:1]" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [], "source": [ "### VERSION 1\n", "# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n", "# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n", "# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n", "# training_set = sentim_analyzer.apply_features(training_docs)\n", "# test_set = sentim_analyzer.apply_features(testing_docs)\n", "sentim_analyzer = SentimentAnalyzer()\n", "\n", "def get_nltk_negs(tokens):\n", " all_words_neg = sentim_analyzer.all_words([mark_negation(tokens)])\n", " return all_words_neg\n", "\n", "def get_unigram_feats(neg_tokens):\n", " unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)\n", " return unigram_feats\n", " \n", "all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)\n", "all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)\n", "# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "### VERSION 2\n", "# all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n", "# unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n", "# sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n", "# training_set = sentim_analyzer.apply_features(training_docs)\n", "# test_set = sentim_analyzer.apply_features(testing_docs)\n", "sentim_analyzer = SentimentAnalyzer()\n", "\n", "def get_nltk_data(tokens):\n", " neg_tokens = sentim_analyzer.all_words([mark_negation(tokens)])\n", " unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)\n", " sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n", " return sentim_analyzer.apply_features(tokens)\n", "\n", "\n", "# def get_unigram_feats(neg_tokens):\n", " \n", "# return unigram_feats\n", "nltk_df = pd.DataFrame()\n", "nltk_df['nltk_data'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)\n", "\n", "# all_df['nltk']\n", "# all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)\n", "# all_df['nltk_unfil'] = all_df.apply(lambda x: get_nltk_data(x['tokens']), axis=1)" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "all_df['nltk_all'] = 0" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0 0\n", "1 0\n", "2 0\n", "3 0\n", "4 0\n", " ..\n", "995 0\n", "996 0\n", "997 0\n", "998 0\n", "999 0\n", "Name: nltk_all, Length: 2000, dtype: int64" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['nltk_all']" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swtopwords_unfiltopwords_fil...v_pos_fdv_freq_words_unfilvader_fd_all_unfilv_compound_fd_ufv_neg_fd_ufv_neu_fd_ufv_pos_fd_ufnltk_negsunigram_featsnltk_all
0bad . bad . \\nbad . \\nthat one word seems to p...N[bad ., bad ., bad ., that one word seems to p...67[bad, bad, bad, that, one, word, seems, to, pr...1071[bad, bad, bad, one, word, seems, pretty, much...515[(the, 60), (a, 35), (to, 34), (of, 24), (this...[(movie, 17), (bad, 8), (one, 7), (meyer, 6), ......0.219the a to of this that i in is movie it and you...{'neg': 0.046, 'neu': 0.954, 'pos': 0.0, 'comp...-0.30710.0460.9540.000[bad, bad, bad, that, one, word, seems, to, pr...[the_NEG, to_NEG, a_NEG, of_NEG, this_NEG, i_N...0
1isn't it the ultimate sign of a movie's cinema...N[isn't it the ultimate sign of a movie's cinem...32[is, it, the, ultimate, sign, of, a, movie, ci...553[ultimate, sign, movie, cinematic, ineptitude,...297[(the, 28), (a, 18), (of, 16), (to, 14), (i, 1...[(movie, 7), (one, 6), (first, 5), (much, 4), ......0.173the a of to i is it and movie this in some one...{'neg': 0.1, 'neu': 0.9, 'pos': 0.0, 'compound...-0.62620.1000.9000.000[is, it, the, ultimate, sign, of, a, movie, ci...[the_NEG, a_NEG, of_NEG, i_NEG, to_NEG, is_NEG...0
2\" gordy \" is not a movie , it is a 90-minute-...N[ \" gordy \" is not a movie , it is a 90-minute...23[gordy, is, not, a, movie, it, is, a, sesame, ...478[gordy, movie, sesame, street, skit, bad, one,...239[(the, 25), (and, 21), (to, 18), (is, 17), (a,...[(gordy, 8), (movie, 5), (one, 4), (stupid, 4)......0.103the and to is a it of this gordy that but on m...{'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'comp...-0.94130.2310.7690.000[gordy, is, not, a_NEG, movie_NEG, it_NEG, is_...[the_NEG, and_NEG, to_NEG, a_NEG, is_NEG, it_N...0
3disconnect the phone line . \\ndon't accept the...N[disconnect the phone line ., don't accept the...37[disconnect, the, phone, line, do, accept, the...604[disconnect, phone, line, accept, charges, any...323[(the, 41), (of, 17), (a, 17), (to, 16), (and,...[(hanging, 9), (sisters, 5), (ryan, 4), (time,......0.248the of a to and is up hanging in as for an tha...{'neg': 0.0, 'neu': 0.869, 'pos': 0.131, 'comp...0.78760.0000.8690.131[disconnect, the, phone, line, do, accept, the...[the, the_NEG, a_NEG, is_NEG, and, of_NEG, to,...0
4when robert forster found himself famous again...N[when robert forster found himself famous agai...29[when, robert, forster, found, himself, famous...386[robert, forster, found, famous, appearing, ja...185[(the, 21), (it, 11), (i, 10), (to, 10), (of, ...[(film, 5), (movie, 5), (american, 4), (perfek......0.000the it i to of and a was is you for film this ...{'neg': 0.056, 'neu': 0.944, 'pos': 0.0, 'comp...-0.42150.0560.9440.000[when, robert, forster, found, himself, famous...[the_NEG, it_NEG, of_NEG, and_NEG, i_NEG, to_N...0
..................................................................
995one of the funniest carry on movies and the th...P[one of the funniest carry on movies and the t...25[one, of, the, funniest, carry, on, movies, an...434[one, funniest, carry, movies, third, medical,...241[(the, 26), (and, 21), (of, 11), (a, 10), (is,...[(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4......0.266the and of a is to on nookey as in who from hi...{'neg': 0.041, 'neu': 0.862, 'pos': 0.097, 'co...0.45760.0410.8620.097[one, of, the, funniest, carry, on, movies, an...[the, and, the_NEG, to, nookey, and_NEG, of, a...0
996i remember making a pact , right after `patch ...P[i remember making a pact , right after `patch...40[i, remember, making, a, pact, right, after, p...652[remember, making, pact, right, patch, adams, ...361[(the, 44), (of, 29), (and, 19), (a, 15), (it,...[(music, 8), (heart, 7), (craven, 6), (movie, ......0.236the of and a it to is with in but her music he...{'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'comp...0.80470.0000.8660.134[i, remember, making, a, pact, right, after, p...[the_NEG, of_NEG, and_NEG, it_NEG, a_NEG, is_N...0
997barely scrapping by playing at a nyc piano bar...P[barely scrapping by playing at a nyc piano ba...23[barely, scrapping, by, playing, at, a, nyc, p...345[barely, scrapping, playing, nyc, piano, bar, ...177[(a, 23), (is, 16), (the, 13), (and, 10), (of,...[(like, 4), (hutton, 3), (old, 3), (high, 2), ......0.196a is the and of with his for in to like she it...{'neg': 0.056, 'neu': 0.783, 'pos': 0.162, 'co...0.72730.0560.7830.162[barely, scrapping, by, playing, at, a, nyc, p...[a_NEG, is_NEG, a, the, with_NEG, the_NEG, for...0
998if the current trends of hollywood filmmaking ...P[if the current trends of hollywood filmmaking...34[if, the, current, trends, of, hollywood, film...730[current, trends, hollywood, filmmaking, conti...428[(the, 49), (of, 31), (and, 19), (in, 18), (to...[(one, 7), (like, 5), (l, 5), (hollywood, 4), ......0.166the of and in to that a is his by one as for l...{'neg': 0.0, 'neu': 0.859, 'pos': 0.141, 'comp...0.75060.0000.8590.141[if, the, current, trends, of, hollywood, film...[the, the_NEG, of_NEG, of, and_NEG, to, in_NEG...0
999capsule : the director of cure brings a weird ...P[capsule : the director of cure brings a weird...45[capsule, the, director, of, cure, brings, a, ...641[capsule, director, cure, brings, weird, compl...340[(the, 33), (to, 28), (and, 21), (a, 18), (of,...[(computer, 11), (kurosawa, 8), (one, 5), (see......0.136the to and a of is his computer are with on no...{'neg': 0.082, 'neu': 0.828, 'pos': 0.09, 'com...0.34970.0820.8280.090[capsule, the, director, of, cure, brings, a, ...[the_NEG, to_NEG, and_NEG, a_NEG, of_NEG, is_N...0
\n", "

2000 rows × 40 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N \n", "1 isn't it the ultimate sign of a movie's cinema... N \n", "2 \" gordy \" is not a movie , it is a 90-minute-... N \n", "3 disconnect the phone line . \\ndon't accept the... N \n", "4 when robert forster found himself famous again... N \n", ".. ... .. \n", "995 one of the funniest carry on movies and the th... P \n", "996 i remember making a pact , right after `patch ... P \n", "997 barely scrapping by playing at a nyc piano bar... P \n", "998 if the current trends of hollywood filmmaking ... P \n", "999 capsule : the director of cure brings a weird ... P \n", "\n", " sentences num_sentences \\\n", "0 [bad ., bad ., bad ., that one word seems to p... 67 \n", "1 [isn't it the ultimate sign of a movie's cinem... 32 \n", "2 [ \" gordy \" is not a movie , it is a 90-minute... 23 \n", "3 [disconnect the phone line ., don't accept the... 37 \n", "4 [when robert forster found himself famous agai... 29 \n", ".. ... ... \n", "995 [one of the funniest carry on movies and the t... 25 \n", "996 [i remember making a pact , right after `patch... 40 \n", "997 [barely scrapping by playing at a nyc piano ba... 23 \n", "998 [if the current trends of hollywood filmmaking... 34 \n", "999 [capsule : the director of cure brings a weird... 45 \n", "\n", " tokens num_tokens \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... 1071 \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... 553 \n", "2 [gordy, is, not, a, movie, it, is, a, sesame, ... 478 \n", "3 [disconnect, the, phone, line, do, accept, the... 604 \n", "4 [when, robert, forster, found, himself, famous... 386 \n", ".. ... ... \n", "995 [one, of, the, funniest, carry, on, movies, an... 434 \n", "996 [i, remember, making, a, pact, right, after, p... 652 \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... 345 \n", "998 [if, the, current, trends, of, hollywood, film... 730 \n", "999 [capsule, the, director, of, cure, brings, a, ... 641 \n", "\n", " no_sw num_no_sw \\\n", "0 [bad, bad, bad, one, word, seems, pretty, much... 515 \n", "1 [ultimate, sign, movie, cinematic, ineptitude,... 297 \n", "2 [gordy, movie, sesame, street, skit, bad, one,... 239 \n", "3 [disconnect, phone, line, accept, charges, any... 323 \n", "4 [robert, forster, found, famous, appearing, ja... 185 \n", ".. ... ... \n", "995 [one, funniest, carry, movies, third, medical,... 241 \n", "996 [remember, making, pact, right, patch, adams, ... 361 \n", "997 [barely, scrapping, playing, nyc, piano, bar, ... 177 \n", "998 [current, trends, hollywood, filmmaking, conti... 428 \n", "999 [capsule, director, cure, brings, weird, compl... 340 \n", "\n", " topwords_unfil \\\n", "0 [(the, 60), (a, 35), (to, 34), (of, 24), (this... \n", "1 [(the, 28), (a, 18), (of, 16), (to, 14), (i, 1... \n", "2 [(the, 25), (and, 21), (to, 18), (is, 17), (a,... \n", "3 [(the, 41), (of, 17), (a, 17), (to, 16), (and,... \n", "4 [(the, 21), (it, 11), (i, 10), (to, 10), (of, ... \n", ".. ... \n", "995 [(the, 26), (and, 21), (of, 11), (a, 10), (is,... \n", "996 [(the, 44), (of, 29), (and, 19), (a, 15), (it,... \n", "997 [(a, 23), (is, 16), (the, 13), (and, 10), (of,... \n", "998 [(the, 49), (of, 31), (and, 19), (in, 18), (to... \n", "999 [(the, 33), (to, 28), (and, 21), (a, 18), (of,... \n", "\n", " topwords_fil ... v_pos_fd \\\n", "0 [(movie, 17), (bad, 8), (one, 7), (meyer, 6), ... ... 0.219 \n", "1 [(movie, 7), (one, 6), (first, 5), (much, 4), ... ... 0.173 \n", "2 [(gordy, 8), (movie, 5), (one, 4), (stupid, 4)... ... 0.103 \n", "3 [(hanging, 9), (sisters, 5), (ryan, 4), (time,... ... 0.248 \n", "4 [(film, 5), (movie, 5), (american, 4), (perfek... ... 0.000 \n", ".. ... ... ... \n", "995 [(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4... ... 0.266 \n", "996 [(music, 8), (heart, 7), (craven, 6), (movie, ... ... 0.236 \n", "997 [(like, 4), (hutton, 3), (old, 3), (high, 2), ... ... 0.196 \n", "998 [(one, 7), (like, 5), (l, 5), (hollywood, 4), ... ... 0.166 \n", "999 [(computer, 11), (kurosawa, 8), (one, 5), (see... ... 0.136 \n", "\n", " v_freq_words_unfil \\\n", "0 the a to of this that i in is movie it and you... \n", "1 the a of to i is it and movie this in some one... \n", "2 the and to is a it of this gordy that but on m... \n", "3 the of a to and is up hanging in as for an tha... \n", "4 the it i to of and a was is you for film this ... \n", ".. ... \n", "995 the and of a is to on nookey as in who from hi... \n", "996 the of and a it to is with in but her music he... \n", "997 a is the and of with his for in to like she it... \n", "998 the of and in to that a is his by one as for l... \n", "999 the to and a of is his computer are with on no... \n", "\n", " vader_fd_all_unfil v_compound_fd_uf \\\n", "0 {'neg': 0.046, 'neu': 0.954, 'pos': 0.0, 'comp... -0.3071 \n", "1 {'neg': 0.1, 'neu': 0.9, 'pos': 0.0, 'compound... -0.6262 \n", "2 {'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'comp... -0.9413 \n", "3 {'neg': 0.0, 'neu': 0.869, 'pos': 0.131, 'comp... 0.7876 \n", "4 {'neg': 0.056, 'neu': 0.944, 'pos': 0.0, 'comp... -0.4215 \n", ".. ... ... \n", "995 {'neg': 0.041, 'neu': 0.862, 'pos': 0.097, 'co... 0.4576 \n", "996 {'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'comp... 0.8047 \n", "997 {'neg': 0.056, 'neu': 0.783, 'pos': 0.162, 'co... 0.7273 \n", "998 {'neg': 0.0, 'neu': 0.859, 'pos': 0.141, 'comp... 0.7506 \n", "999 {'neg': 0.082, 'neu': 0.828, 'pos': 0.09, 'com... 0.3497 \n", "\n", " v_neg_fd_uf v_neu_fd_uf v_pos_fd_uf \\\n", "0 0.046 0.954 0.000 \n", "1 0.100 0.900 0.000 \n", "2 0.231 0.769 0.000 \n", "3 0.000 0.869 0.131 \n", "4 0.056 0.944 0.000 \n", ".. ... ... ... \n", "995 0.041 0.862 0.097 \n", "996 0.000 0.866 0.134 \n", "997 0.056 0.783 0.162 \n", "998 0.000 0.859 0.141 \n", "999 0.082 0.828 0.090 \n", "\n", " nltk_negs \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... \n", "2 [gordy, is, not, a_NEG, movie_NEG, it_NEG, is_... \n", "3 [disconnect, the, phone, line, do, accept, the... \n", "4 [when, robert, forster, found, himself, famous... \n", ".. ... \n", "995 [one, of, the, funniest, carry, on, movies, an... \n", "996 [i, remember, making, a, pact, right, after, p... \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... \n", "998 [if, the, current, trends, of, hollywood, film... \n", "999 [capsule, the, director, of, cure, brings, a, ... \n", "\n", " unigram_feats nltk_all \n", "0 [the_NEG, to_NEG, a_NEG, of_NEG, this_NEG, i_N... 0 \n", "1 [the_NEG, a_NEG, of_NEG, i_NEG, to_NEG, is_NEG... 0 \n", "2 [the_NEG, and_NEG, to_NEG, a_NEG, is_NEG, it_N... 0 \n", "3 [the, the_NEG, a_NEG, is_NEG, and, of_NEG, to,... 0 \n", "4 [the_NEG, it_NEG, of_NEG, and_NEG, i_NEG, to_N... 0 \n", ".. ... ... \n", "995 [the, and, the_NEG, to, nookey, and_NEG, of, a... 0 \n", "996 [the_NEG, of_NEG, and_NEG, it_NEG, a_NEG, is_N... 0 \n", "997 [a_NEG, is_NEG, a, the, with_NEG, the_NEG, for... 0 \n", "998 [the, the_NEG, of_NEG, of, and_NEG, to, in_NEG... 0 \n", "999 [the_NEG, to_NEG, and_NEG, a_NEG, of_NEG, is_N... 0 \n", "\n", "[2000 rows x 40 columns]" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import casual_tokenize\n", "from collections import Counter\n", "all_df['bow_nosw'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNsentencesnum_sentencestokensnum_tokensno_swnum_no_swtopwords_unfiltopwords_fil...v_freq_words_unfilvader_fd_all_unfilv_compound_fd_ufv_neg_fd_ufv_neu_fd_ufv_pos_fd_ufnltk_negsunigram_featsnltk_allbow_nosw
0bad . bad . \\nbad . \\nthat one word seems to p...N[bad ., bad ., bad ., that one word seems to p...67[bad, bad, bad, that, one, word, seems, to, pr...1071[bad, bad, bad, one, word, seems, pretty, much...515[(the, 60), (a, 35), (to, 34), (of, 24), (this...[(movie, 17), (bad, 8), (one, 7), (meyer, 6), ......the a to of this that i in is movie it and you...{'neg': 0.046, 'neu': 0.954, 'pos': 0.0, 'comp...-0.30710.0460.9540.000[bad, bad, bad, that, one, word, seems, to, pr...[the_NEG, to_NEG, a_NEG, of_NEG, this_NEG, i_N...0{'bad': 8, '.': 62, 'that': 19, 'one': 7, 'wor...
1isn't it the ultimate sign of a movie's cinema...N[isn't it the ultimate sign of a movie's cinem...32[is, it, the, ultimate, sign, of, a, movie, ci...553[ultimate, sign, movie, cinematic, ineptitude,...297[(the, 28), (a, 18), (of, 16), (to, 14), (i, 1...[(movie, 7), (one, 6), (first, 5), (much, 4), ......the a of to i is it and movie this in some one...{'neg': 0.1, 'neu': 0.9, 'pos': 0.0, 'compound...-0.62620.1000.9000.000[is, it, the, ultimate, sign, of, a, movie, ci...[the_NEG, a_NEG, of_NEG, i_NEG, to_NEG, is_NEG...0{'isn't': 2, 'it': 9, 'the': 28, 'ultimate': 1...
2\" gordy \" is not a movie , it is a 90-minute-...N[ \" gordy \" is not a movie , it is a 90-minute...23[gordy, is, not, a, movie, it, is, a, sesame, ...478[gordy, movie, sesame, street, skit, bad, one,...239[(the, 25), (and, 21), (to, 18), (is, 17), (a,...[(gordy, 8), (movie, 5), (one, 4), (stupid, 4)......the and to is a it of this gordy that but on m...{'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'comp...-0.94130.2310.7690.000[gordy, is, not, a_NEG, movie_NEG, it_NEG, is_...[the_NEG, and_NEG, to_NEG, a_NEG, is_NEG, it_N...0{'\"': 12, 'gordy': 8, 'is': 16, 'not': 3, 'a':...
3disconnect the phone line . \\ndon't accept the...N[disconnect the phone line ., don't accept the...37[disconnect, the, phone, line, do, accept, the...604[disconnect, phone, line, accept, charges, any...323[(the, 41), (of, 17), (a, 17), (to, 16), (and,...[(hanging, 9), (sisters, 5), (ryan, 4), (time,......the of a to and is up hanging in as for an tha...{'neg': 0.0, 'neu': 0.869, 'pos': 0.131, 'comp...0.78760.0000.8690.131[disconnect, the, phone, line, do, accept, the...[the, the_NEG, a_NEG, is_NEG, and, of_NEG, to,...0{'disconnect': 1, 'the': 41, 'phone': 2, 'line...
4when robert forster found himself famous again...N[when robert forster found himself famous agai...29[when, robert, forster, found, himself, famous...386[robert, forster, found, famous, appearing, ja...185[(the, 21), (it, 11), (i, 10), (to, 10), (of, ...[(film, 5), (movie, 5), (american, 4), (perfek......the it i to of and a was is you for film this ...{'neg': 0.056, 'neu': 0.944, 'pos': 0.0, 'comp...-0.42150.0560.9440.000[when, robert, forster, found, himself, famous...[the_NEG, it_NEG, of_NEG, and_NEG, i_NEG, to_N...0{'when': 2, 'robert': 2, 'forster': 3, 'found'...
..................................................................
995one of the funniest carry on movies and the th...P[one of the funniest carry on movies and the t...25[one, of, the, funniest, carry, on, movies, an...434[one, funniest, carry, movies, third, medical,...241[(the, 26), (and, 21), (of, 11), (a, 10), (is,...[(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4......the and of a is to on nookey as in who from hi...{'neg': 0.041, 'neu': 0.862, 'pos': 0.097, 'co...0.45760.0410.8620.097[one, of, the, funniest, carry, on, movies, an...[the, and, the_NEG, to, nookey, and_NEG, of, a...0{'one': 1, 'of': 11, 'the': 26, 'funniest': 1,...
996i remember making a pact , right after `patch ...P[i remember making a pact , right after `patch...40[i, remember, making, a, pact, right, after, p...652[remember, making, pact, right, patch, adams, ...361[(the, 44), (of, 29), (and, 19), (a, 15), (it,...[(music, 8), (heart, 7), (craven, 6), (movie, ......the of and a it to is with in but her music he...{'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'comp...0.80470.0000.8660.134[i, remember, making, a, pact, right, after, p...[the_NEG, of_NEG, and_NEG, it_NEG, a_NEG, is_N...0{'i': 1, 'remember': 1, 'making': 1, 'a': 15, ...
997barely scrapping by playing at a nyc piano bar...P[barely scrapping by playing at a nyc piano ba...23[barely, scrapping, by, playing, at, a, nyc, p...345[barely, scrapping, playing, nyc, piano, bar, ...177[(a, 23), (is, 16), (the, 13), (and, 10), (of,...[(like, 4), (hutton, 3), (old, 3), (high, 2), ......a is the and of with his for in to like she it...{'neg': 0.056, 'neu': 0.783, 'pos': 0.162, 'co...0.72730.0560.7830.162[barely, scrapping, by, playing, at, a, nyc, p...[a_NEG, is_NEG, a, the, with_NEG, the_NEG, for...0{'barely': 1, 'scrapping': 1, 'by': 2, 'playin...
998if the current trends of hollywood filmmaking ...P[if the current trends of hollywood filmmaking...34[if, the, current, trends, of, hollywood, film...730[current, trends, hollywood, filmmaking, conti...428[(the, 49), (of, 31), (and, 19), (in, 18), (to...[(one, 7), (like, 5), (l, 5), (hollywood, 4), ......the of and in to that a is his by one as for l...{'neg': 0.0, 'neu': 0.859, 'pos': 0.141, 'comp...0.75060.0000.8590.141[if, the, current, trends, of, hollywood, film...[the, the_NEG, of_NEG, of, and_NEG, to, in_NEG...0{'if': 1, 'the': 49, 'current': 1, 'trends': 1...
999capsule : the director of cure brings a weird ...P[capsule : the director of cure brings a weird...45[capsule, the, director, of, cure, brings, a, ...641[capsule, director, cure, brings, weird, compl...340[(the, 33), (to, 28), (and, 21), (a, 18), (of,...[(computer, 11), (kurosawa, 8), (one, 5), (see......the to and a of is his computer are with on no...{'neg': 0.082, 'neu': 0.828, 'pos': 0.09, 'com...0.34970.0820.8280.090[capsule, the, director, of, cure, brings, a, ...[the_NEG, to_NEG, and_NEG, a_NEG, of_NEG, is_N...0{'capsule': 1, ':': 1, 'the': 33, 'director': ...
\n", "

2000 rows × 41 columns

\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 bad . bad . \\nbad . \\nthat one word seems to p... N \n", "1 isn't it the ultimate sign of a movie's cinema... N \n", "2 \" gordy \" is not a movie , it is a 90-minute-... N \n", "3 disconnect the phone line . \\ndon't accept the... N \n", "4 when robert forster found himself famous again... N \n", ".. ... .. \n", "995 one of the funniest carry on movies and the th... P \n", "996 i remember making a pact , right after `patch ... P \n", "997 barely scrapping by playing at a nyc piano bar... P \n", "998 if the current trends of hollywood filmmaking ... P \n", "999 capsule : the director of cure brings a weird ... P \n", "\n", " sentences num_sentences \\\n", "0 [bad ., bad ., bad ., that one word seems to p... 67 \n", "1 [isn't it the ultimate sign of a movie's cinem... 32 \n", "2 [ \" gordy \" is not a movie , it is a 90-minute... 23 \n", "3 [disconnect the phone line ., don't accept the... 37 \n", "4 [when robert forster found himself famous agai... 29 \n", ".. ... ... \n", "995 [one of the funniest carry on movies and the t... 25 \n", "996 [i remember making a pact , right after `patch... 40 \n", "997 [barely scrapping by playing at a nyc piano ba... 23 \n", "998 [if the current trends of hollywood filmmaking... 34 \n", "999 [capsule : the director of cure brings a weird... 45 \n", "\n", " tokens num_tokens \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... 1071 \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... 553 \n", "2 [gordy, is, not, a, movie, it, is, a, sesame, ... 478 \n", "3 [disconnect, the, phone, line, do, accept, the... 604 \n", "4 [when, robert, forster, found, himself, famous... 386 \n", ".. ... ... \n", "995 [one, of, the, funniest, carry, on, movies, an... 434 \n", "996 [i, remember, making, a, pact, right, after, p... 652 \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... 345 \n", "998 [if, the, current, trends, of, hollywood, film... 730 \n", "999 [capsule, the, director, of, cure, brings, a, ... 641 \n", "\n", " no_sw num_no_sw \\\n", "0 [bad, bad, bad, one, word, seems, pretty, much... 515 \n", "1 [ultimate, sign, movie, cinematic, ineptitude,... 297 \n", "2 [gordy, movie, sesame, street, skit, bad, one,... 239 \n", "3 [disconnect, phone, line, accept, charges, any... 323 \n", "4 [robert, forster, found, famous, appearing, ja... 185 \n", ".. ... ... \n", "995 [one, funniest, carry, movies, third, medical,... 241 \n", "996 [remember, making, pact, right, patch, adams, ... 361 \n", "997 [barely, scrapping, playing, nyc, piano, bar, ... 177 \n", "998 [current, trends, hollywood, filmmaking, conti... 428 \n", "999 [capsule, director, cure, brings, weird, compl... 340 \n", "\n", " topwords_unfil \\\n", "0 [(the, 60), (a, 35), (to, 34), (of, 24), (this... \n", "1 [(the, 28), (a, 18), (of, 16), (to, 14), (i, 1... \n", "2 [(the, 25), (and, 21), (to, 18), (is, 17), (a,... \n", "3 [(the, 41), (of, 17), (a, 17), (to, 16), (and,... \n", "4 [(the, 21), (it, 11), (i, 10), (to, 10), (of, ... \n", ".. ... \n", "995 [(the, 26), (and, 21), (of, 11), (a, 10), (is,... \n", "996 [(the, 44), (of, 29), (and, 19), (a, 15), (it,... \n", "997 [(a, 23), (is, 16), (the, 13), (and, 10), (of,... \n", "998 [(the, 49), (of, 31), (and, 19), (in, 18), (to... \n", "999 [(the, 33), (to, 28), (and, 21), (a, 18), (of,... \n", "\n", " topwords_fil ... \\\n", "0 [(movie, 17), (bad, 8), (one, 7), (meyer, 6), ... ... \n", "1 [(movie, 7), (one, 6), (first, 5), (much, 4), ... ... \n", "2 [(gordy, 8), (movie, 5), (one, 4), (stupid, 4)... ... \n", "3 [(hanging, 9), (sisters, 5), (ryan, 4), (time,... ... \n", "4 [(film, 5), (movie, 5), (american, 4), (perfek... ... \n", ".. ... ... \n", "995 [(nookey, 9), (hawtrey, 5), (carry, 4), (dr, 4... ... \n", "996 [(music, 8), (heart, 7), (craven, 6), (movie, ... ... \n", "997 [(like, 4), (hutton, 3), (old, 3), (high, 2), ... ... \n", "998 [(one, 7), (like, 5), (l, 5), (hollywood, 4), ... ... \n", "999 [(computer, 11), (kurosawa, 8), (one, 5), (see... ... \n", "\n", " v_freq_words_unfil \\\n", "0 the a to of this that i in is movie it and you... \n", "1 the a of to i is it and movie this in some one... \n", "2 the and to is a it of this gordy that but on m... \n", "3 the of a to and is up hanging in as for an tha... \n", "4 the it i to of and a was is you for film this ... \n", ".. ... \n", "995 the and of a is to on nookey as in who from hi... \n", "996 the of and a it to is with in but her music he... \n", "997 a is the and of with his for in to like she it... \n", "998 the of and in to that a is his by one as for l... \n", "999 the to and a of is his computer are with on no... \n", "\n", " vader_fd_all_unfil v_compound_fd_uf \\\n", "0 {'neg': 0.046, 'neu': 0.954, 'pos': 0.0, 'comp... -0.3071 \n", "1 {'neg': 0.1, 'neu': 0.9, 'pos': 0.0, 'compound... -0.6262 \n", "2 {'neg': 0.231, 'neu': 0.769, 'pos': 0.0, 'comp... -0.9413 \n", "3 {'neg': 0.0, 'neu': 0.869, 'pos': 0.131, 'comp... 0.7876 \n", "4 {'neg': 0.056, 'neu': 0.944, 'pos': 0.0, 'comp... -0.4215 \n", ".. ... ... \n", "995 {'neg': 0.041, 'neu': 0.862, 'pos': 0.097, 'co... 0.4576 \n", "996 {'neg': 0.0, 'neu': 0.866, 'pos': 0.134, 'comp... 0.8047 \n", "997 {'neg': 0.056, 'neu': 0.783, 'pos': 0.162, 'co... 0.7273 \n", "998 {'neg': 0.0, 'neu': 0.859, 'pos': 0.141, 'comp... 0.7506 \n", "999 {'neg': 0.082, 'neu': 0.828, 'pos': 0.09, 'com... 0.3497 \n", "\n", " v_neg_fd_uf v_neu_fd_uf v_pos_fd_uf \\\n", "0 0.046 0.954 0.000 \n", "1 0.100 0.900 0.000 \n", "2 0.231 0.769 0.000 \n", "3 0.000 0.869 0.131 \n", "4 0.056 0.944 0.000 \n", ".. ... ... ... \n", "995 0.041 0.862 0.097 \n", "996 0.000 0.866 0.134 \n", "997 0.056 0.783 0.162 \n", "998 0.000 0.859 0.141 \n", "999 0.082 0.828 0.090 \n", "\n", " nltk_negs \\\n", "0 [bad, bad, bad, that, one, word, seems, to, pr... \n", "1 [is, it, the, ultimate, sign, of, a, movie, ci... \n", "2 [gordy, is, not, a_NEG, movie_NEG, it_NEG, is_... \n", "3 [disconnect, the, phone, line, do, accept, the... \n", "4 [when, robert, forster, found, himself, famous... \n", ".. ... \n", "995 [one, of, the, funniest, carry, on, movies, an... \n", "996 [i, remember, making, a, pact, right, after, p... \n", "997 [barely, scrapping, by, playing, at, a, nyc, p... \n", "998 [if, the, current, trends, of, hollywood, film... \n", "999 [capsule, the, director, of, cure, brings, a, ... \n", "\n", " unigram_feats nltk_all \\\n", "0 [the_NEG, to_NEG, a_NEG, of_NEG, this_NEG, i_N... 0 \n", "1 [the_NEG, a_NEG, of_NEG, i_NEG, to_NEG, is_NEG... 0 \n", "2 [the_NEG, and_NEG, to_NEG, a_NEG, is_NEG, it_N... 0 \n", "3 [the, the_NEG, a_NEG, is_NEG, and, of_NEG, to,... 0 \n", "4 [the_NEG, it_NEG, of_NEG, and_NEG, i_NEG, to_N... 0 \n", ".. ... ... \n", "995 [the, and, the_NEG, to, nookey, and_NEG, of, a... 0 \n", "996 [the_NEG, of_NEG, and_NEG, it_NEG, a_NEG, is_N... 0 \n", "997 [a_NEG, is_NEG, a, the, with_NEG, the_NEG, for... 0 \n", "998 [the, the_NEG, of_NEG, of, and_NEG, to, in_NEG... 0 \n", "999 [the_NEG, to_NEG, and_NEG, a_NEG, of_NEG, is_N... 0 \n", "\n", " bow_nosw \n", "0 {'bad': 8, '.': 62, 'that': 19, 'one': 7, 'wor... \n", "1 {'isn't': 2, 'it': 9, 'the': 28, 'ultimate': 1... \n", "2 {'\"': 12, 'gordy': 8, 'is': 16, 'not': 3, 'a':... \n", "3 {'disconnect': 1, 'the': 41, 'phone': 2, 'line... \n", "4 {'when': 2, 'robert': 2, 'forster': 3, 'found'... \n", ".. ... \n", "995 {'one': 1, 'of': 11, 'the': 26, 'funniest': 1,... \n", "996 {'i': 1, 'remember': 1, 'making': 1, 'a': 15, ... \n", "997 {'barely': 1, 'scrapping': 1, 'by': 2, 'playin... \n", "998 {'if': 1, 'the': 49, 'current': 1, 'trends': 1... \n", "999 {'capsule': 1, ':': 1, 'the': 33, 'director': ... \n", "\n", "[2000 rows x 41 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "\n", "# from nltk.tokenize import casual_tokenize\n", "# from collections import Counter\n", "# # all_df['bow_nosw'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }