{ "cells": [ { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "# import nltk\n", "# from nltk.tokenize import word_tokenize\n", "# from nltk.probability import FreqDist\n", "# file = open('WK2/moviereview_arff.arff')\n", "# tokens = []\n", "# for line in file:\n", "# # print(type(line))\n", "# # tokens.append(word_tokenize(line))\n", "# len(tokens)" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [], "source": [ "import nltk\n", "from nltk.tokenize import word_tokenize\n", "from nltk.probability import FreqDist\n", "import pandas as pd\n", "file = open('WK2/moviereview.csv')\n", "all_df = pd.DataFrame(file)" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "# freq_dist_sent = []\n", "# for sent in tokenized_sentences[1:2]:\n", "# fdist = FreqDist(sent)\n", "# print(len(sent))\n", "# print(fdist['bad'])\n", "# # print(fdist.items())\n", "# # print(sent)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize\n", "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "all_df['tokenized'] = all_df.apply(lambda x: get_tokens(x[0]),axis=1)\n", "all_df['tokenized_count'] = all_df.apply(lambda x: len(x['tokenized']),axis=1)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0tokenizedtokenized_count
0text,reviewclass\\n[text, reviewclass]2
1'plot : two teen couples go to a church party ...[two, teen, couples, go, to, a, church, party,...638
2'the happy bastard\\'s quick movie review \\ndam...[happy, quick, movie, review, that, bug, got, ...215
3'it is movies like these that make a jaded mov...[is, movies, like, these, that, make, a, jaded...444
4' \\\" quest for camelot \\\" is warner bros . \\' ...[quest, for, camelot, is, warner, bros, first,...410
............
1996'wow ! what a movie . \\nit\\'s everything a mov...[what, a, movie, everything, a, movie, can, be...702
1997'richard gere can be a commanding actor , but ...[gere, can, be, a, commanding, actor, but, not...286
1998'glory--starring matthew broderick , denzel wa...[starring, matthew, broderick, denzel, washing...990
1999'steven spielberg\\'s second epic film on world...[second, epic, film, on, world, war, ii, is, a...538
2000'truman ( \\\" true-man \\\" ) burbank is the perf...[burbank, is, the, perfect, name, for, jim, ch...901
\n", "

2001 rows × 3 columns

\n", "
" ], "text/plain": [ " 0 \\\n", "0 text,reviewclass\\n \n", "1 'plot : two teen couples go to a church party ... \n", "2 'the happy bastard\\'s quick movie review \\ndam... \n", "3 'it is movies like these that make a jaded mov... \n", "4 ' \\\" quest for camelot \\\" is warner bros . \\' ... \n", "... ... \n", "1996 'wow ! what a movie . \\nit\\'s everything a mov... \n", "1997 'richard gere can be a commanding actor , but ... \n", "1998 'glory--starring matthew broderick , denzel wa... \n", "1999 'steven spielberg\\'s second epic film on world... \n", "2000 'truman ( \\\" true-man \\\" ) burbank is the perf... \n", "\n", " tokenized tokenized_count \n", "0 [text, reviewclass] 2 \n", "1 [two, teen, couples, go, to, a, church, party,... 638 \n", "2 [happy, quick, movie, review, that, bug, got, ... 215 \n", "3 [is, movies, like, these, that, make, a, jaded... 444 \n", "4 [quest, for, camelot, is, warner, bros, first,... 410 \n", "... ... ... \n", "1996 [what, a, movie, everything, a, movie, can, be... 702 \n", "1997 [gere, can, be, a, commanding, actor, but, not... 286 \n", "1998 [starring, matthew, broderick, denzel, washing... 990 \n", "1999 [second, epic, film, on, world, war, ii, is, a... 538 \n", "2000 [burbank, is, the, perfect, name, for, jim, ch... 901 \n", "\n", "[2001 rows x 3 columns]" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "from nltk.corpus import stopwords\n", "stop_words = set(stopwords.words(\"english\"))\n", "def remove_stopwords(sentence):\n", " filtered_text = []\n", " for word in sentence:\n", " if word not in stop_words:\n", " filtered_text.append(word)\n", " return filtered_text\n", "all_df['no_stopwords'] = all_df.apply(lambda x: remove_stopwords(x['tokenized']),axis=1)\n", "all_df['no_stopwords_count'] = all_df.apply(lambda x: len(x['no_stopwords']),axis=1)" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0tokenizedtokenized_countno_stopwordsno_stopwords_count
0text,reviewclass\\n[text, reviewclass]2[text, reviewclass]2
1'plot : two teen couples go to a church party ...[two, teen, couples, go, to, a, church, party,...638[two, teen, couples, go, church, party, drink,...306
2'the happy bastard\\'s quick movie review \\ndam...[happy, quick, movie, review, that, bug, got, ...215[happy, quick, movie, review, bug, got, head, ...119
3'it is movies like these that make a jaded mov...[is, movies, like, these, that, make, a, jaded...444[movies, like, make, jaded, movie, viewer, tha...246
4' \\\" quest for camelot \\\" is warner bros . \\' ...[quest, for, camelot, is, warner, bros, first,...410[quest, camelot, warner, bros, first, attempt,...234
..................
1996'wow ! what a movie . \\nit\\'s everything a mov...[what, a, movie, everything, a, movie, can, be...702[movie, everything, movie, funny, dramatic, in...355
1997'richard gere can be a commanding actor , but ...[gere, can, be, a, commanding, actor, but, not...286[gere, commanding, actor, always, great, films...148
1998'glory--starring matthew broderick , denzel wa...[starring, matthew, broderick, denzel, washing...990[starring, matthew, broderick, denzel, washing...561
1999'steven spielberg\\'s second epic film on world...[second, epic, film, on, world, war, ii, is, a...538[second, epic, film, world, war, ii, unquestio...287
2000'truman ( \\\" true-man \\\" ) burbank is the perf...[burbank, is, the, perfect, name, for, jim, ch...901[burbank, perfect, name, jim, character, film,...483
\n", "

2001 rows × 5 columns

\n", "
" ], "text/plain": [ " 0 \\\n", "0 text,reviewclass\\n \n", "1 'plot : two teen couples go to a church party ... \n", "2 'the happy bastard\\'s quick movie review \\ndam... \n", "3 'it is movies like these that make a jaded mov... \n", "4 ' \\\" quest for camelot \\\" is warner bros . \\' ... \n", "... ... \n", "1996 'wow ! what a movie . \\nit\\'s everything a mov... \n", "1997 'richard gere can be a commanding actor , but ... \n", "1998 'glory--starring matthew broderick , denzel wa... \n", "1999 'steven spielberg\\'s second epic film on world... \n", "2000 'truman ( \\\" true-man \\\" ) burbank is the perf... \n", "\n", " tokenized tokenized_count \\\n", "0 [text, reviewclass] 2 \n", "1 [two, teen, couples, go, to, a, church, party,... 638 \n", "2 [happy, quick, movie, review, that, bug, got, ... 215 \n", "3 [is, movies, like, these, that, make, a, jaded... 444 \n", "4 [quest, for, camelot, is, warner, bros, first,... 410 \n", "... ... ... \n", "1996 [what, a, movie, everything, a, movie, can, be... 702 \n", "1997 [gere, can, be, a, commanding, actor, but, not... 286 \n", "1998 [starring, matthew, broderick, denzel, washing... 990 \n", "1999 [second, epic, film, on, world, war, ii, is, a... 538 \n", "2000 [burbank, is, the, perfect, name, for, jim, ch... 901 \n", "\n", " no_stopwords no_stopwords_count \n", "0 [text, reviewclass] 2 \n", "1 [two, teen, couples, go, church, party, drink,... 306 \n", "2 [happy, quick, movie, review, bug, got, head, ... 119 \n", "3 [movies, like, make, jaded, movie, viewer, tha... 246 \n", "4 [quest, camelot, warner, bros, first, attempt,... 234 \n", "... ... ... \n", "1996 [movie, everything, movie, funny, dramatic, in... 355 \n", "1997 [gere, commanding, actor, always, great, films... 148 \n", "1998 [starring, matthew, broderick, denzel, washing... 561 \n", "1999 [second, epic, film, world, war, ii, unquestio... 287 \n", "2000 [burbank, perfect, name, jim, character, film,... 483 \n", "\n", "[2001 rows x 5 columns]" ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [], "source": [ "from nltk.probability import FreqDist\n", "def get_most_common(tokens):\n", " fdist = FreqDist(tokens)\n", " return fdist\n", "all_df['fdist'] = all_df.apply(lambda x: get_most_common(x['no_stopwords']),axis=1)" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "all_df = all_df[1:]\n", "# all_df" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:14: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " \n", "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:15: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " from ipykernel import kernelapp as app\n" ] } ], "source": [ "# In the 2000 docs, how many times was \"bad\" used\n", "# inverse of the normalized value\n", "def get_bad(fdist): \n", "# fdist['bad']\n", " return fdist['bad']\n", "\n", "# import math\n", "# (math.log10(2000/760))\n", "# print((all_df['bad']!=0).sum())\n", "\n", "def get_tfidf(fdist): \n", " return fdist['bad']*(math.log10(2000/760))\n", "\n", "all_df['bad'] = all_df.apply(lambda x: get_bad(x['fdist']),axis=1)\n", "all_df['tfidf_bad'] = all_df.apply(lambda x: get_tfidf(x['fdist']),axis=1)" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0tokenizedtokenized_countno_stopwordsno_stopwords_countfdistbadtfidf_bad
1'plot : two teen couples go to a church party ...[two, teen, couples, go, to, a, church, party,...638[two, teen, couples, go, church, party, drink,...306{'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '...20.840433
2'the happy bastard\\'s quick movie review \\ndam...[happy, quick, movie, review, that, bug, got, ...215[happy, quick, movie, review, bug, got, head, ...119{'happy': 1, 'quick': 1, 'movie': 5, 'review':...00.000000
3'it is movies like these that make a jaded mov...[is, movies, like, these, that, make, a, jaded...444[movies, like, make, jaded, movie, viewer, tha...246{'movies': 1, 'like': 4, 'make': 2, 'jaded': 1...00.000000
4' \\\" quest for camelot \\\" is warner bros . \\' ...[quest, for, camelot, is, warner, bros, first,...410[quest, camelot, warner, bros, first, attempt,...234{'quest': 5, 'camelot': 4, 'warner': 1, 'bros'...00.000000
5'synopsis : a mentally unstable man undergoing...[a, mentally, unstable, man, undergoing, psych...658[mentally, unstable, man, undergoing, psychoth...346{'mentally': 1, 'unstable': 1, 'man': 2, 'unde...20.840433
...........................
1996'wow ! what a movie . \\nit\\'s everything a mov...[what, a, movie, everything, a, movie, can, be...702[movie, everything, movie, funny, dramatic, in...355{'movie': 14, 'everything': 2, 'funny': 5, 'dr...00.000000
1997'richard gere can be a commanding actor , but ...[gere, can, be, a, commanding, actor, but, not...286[gere, commanding, actor, always, great, films...148{'gere': 1, 'commanding': 1, 'actor': 1, 'alwa...00.000000
1998'glory--starring matthew broderick , denzel wa...[starring, matthew, broderick, denzel, washing...990[starring, matthew, broderick, denzel, washing...561{'starring': 1, 'matthew': 1, 'broderick': 2, ...00.000000
1999'steven spielberg\\'s second epic film on world...[second, epic, film, on, world, war, ii, is, a...538[second, epic, film, world, war, ii, unquestio...287{'second': 1, 'epic': 2, 'film': 14, 'world': ...00.000000
2000'truman ( \\\" true-man \\\" ) burbank is the perf...[burbank, is, the, perfect, name, for, jim, ch...901[burbank, perfect, name, jim, character, film,...483{'burbank': 4, 'perfect': 4, 'name': 1, 'jim':...00.000000
\n", "

2000 rows × 8 columns

\n", "
" ], "text/plain": [ " 0 \\\n", "1 'plot : two teen couples go to a church party ... \n", "2 'the happy bastard\\'s quick movie review \\ndam... \n", "3 'it is movies like these that make a jaded mov... \n", "4 ' \\\" quest for camelot \\\" is warner bros . \\' ... \n", "5 'synopsis : a mentally unstable man undergoing... \n", "... ... \n", "1996 'wow ! what a movie . \\nit\\'s everything a mov... \n", "1997 'richard gere can be a commanding actor , but ... \n", "1998 'glory--starring matthew broderick , denzel wa... \n", "1999 'steven spielberg\\'s second epic film on world... \n", "2000 'truman ( \\\" true-man \\\" ) burbank is the perf... \n", "\n", " tokenized tokenized_count \\\n", "1 [two, teen, couples, go, to, a, church, party,... 638 \n", "2 [happy, quick, movie, review, that, bug, got, ... 215 \n", "3 [is, movies, like, these, that, make, a, jaded... 444 \n", "4 [quest, for, camelot, is, warner, bros, first,... 410 \n", "5 [a, mentally, unstable, man, undergoing, psych... 658 \n", "... ... ... \n", "1996 [what, a, movie, everything, a, movie, can, be... 702 \n", "1997 [gere, can, be, a, commanding, actor, but, not... 286 \n", "1998 [starring, matthew, broderick, denzel, washing... 990 \n", "1999 [second, epic, film, on, world, war, ii, is, a... 538 \n", "2000 [burbank, is, the, perfect, name, for, jim, ch... 901 \n", "\n", " no_stopwords no_stopwords_count \\\n", "1 [two, teen, couples, go, church, party, drink,... 306 \n", "2 [happy, quick, movie, review, bug, got, head, ... 119 \n", "3 [movies, like, make, jaded, movie, viewer, tha... 246 \n", "4 [quest, camelot, warner, bros, first, attempt,... 234 \n", "5 [mentally, unstable, man, undergoing, psychoth... 346 \n", "... ... ... \n", "1996 [movie, everything, movie, funny, dramatic, in... 355 \n", "1997 [gere, commanding, actor, always, great, films... 148 \n", "1998 [starring, matthew, broderick, denzel, washing... 561 \n", "1999 [second, epic, film, world, war, ii, unquestio... 287 \n", "2000 [burbank, perfect, name, jim, character, film,... 483 \n", "\n", " fdist bad tfidf_bad \n", "1 {'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '... 2 0.840433 \n", "2 {'happy': 1, 'quick': 1, 'movie': 5, 'review':... 0 0.000000 \n", "3 {'movies': 1, 'like': 4, 'make': 2, 'jaded': 1... 0 0.000000 \n", "4 {'quest': 5, 'camelot': 4, 'warner': 1, 'bros'... 0 0.000000 \n", "5 {'mentally': 1, 'unstable': 1, 'man': 2, 'unde... 2 0.840433 \n", "... ... ... ... \n", "1996 {'movie': 14, 'everything': 2, 'funny': 5, 'dr... 0 0.000000 \n", "1997 {'gere': 1, 'commanding': 1, 'actor': 1, 'alwa... 0 0.000000 \n", "1998 {'starring': 1, 'matthew': 1, 'broderick': 2, ... 0 0.000000 \n", "1999 {'second': 1, 'epic': 2, 'film': 14, 'world': ... 0 0.000000 \n", "2000 {'burbank': 4, 'perfect': 4, 'name': 1, 'jim':... 0 0.000000 \n", "\n", "[2000 rows x 8 columns]" ] }, "execution_count": 57, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df" ] }, { "cell_type": "code", "execution_count": 58, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "760\n" ] } ], "source": [ "print((all_df['bad']!=0).sum())" ] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4202164033831899" ] }, "execution_count": 59, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import math\n", "(math.log10(2000/760))" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/usr/local/lib/python3.7/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " \"\"\"Entry point for launching an IPython kernel.\n" ] } ], "source": [ "all_df['removed'] = all_df['tokenized_count'] - all_df['no_stopwords_count']" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "543035" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['removed'].sum()" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1189601" ] }, "execution_count": 62, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['tokenized_count'].sum()" ] }, { "cell_type": "code", "execution_count": 63, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.4564849895048844" ] }, "execution_count": 63, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df['removed'].sum()/all_df['tokenized_count'].sum()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }