{ "cells": [
 {
  "cell_type": "code",
  "execution_count": 46,
  "metadata": {},
  "outputs": [],
  "source": [
   "import nltk\n",
   "from nltk.tokenize import word_tokenize\n",
   "from nltk.probability import FreqDist\n",
   "import pandas as pd\n",
   "\n",
   "# Location of the review dump, relative to the notebook's working directory.\n",
   "REVIEWS_PATH = 'WK2/moviereview.csv'\n",
   "\n",
   "# Each physical line of the file becomes one row in a single column labelled 0;\n",
   "# nothing is split on commas, so the header line ends up as row 0.\n",
   "# The with-block closes the handle as soon as the frame has been built.\n",
   "with open(REVIEWS_PATH) as file:\n",
   "    all_df = pd.DataFrame(file)"
  ]
 },
 {
  "cell_type": "code",
  "execution_count": 47,
  "metadata": {},
  "outputs": [],
  "source": [
   "def get_tokens(sentence):\n",
   "    \"\"\"Return the lowercased, purely alphabetic tokens of ``sentence``.\n",
   "\n",
   "    Args:\n",
   "        sentence: Text to split with nltk's ``word_tokenize``.\n",
   "\n",
   "    Returns:\n",
   "        A list of lowercased tokens, in order of appearance, keeping only\n",
   "        those for which ``str.isalpha()`` is true (empty if none qualify).\n",
   "    \"\"\"\n",
   "    # isalpha() rejects any token holding a non-letter, so punctuation,\n",
   "    # numbers and words with an attached quote or hyphen are discarded.\n",
   "    return [word.lower() for word in word_tokenize(sentence) if word.isalpha()]\n",
   "\n",
   "\n",
   "# NOTE(review): in the saved output the row starting 'plot : two teen ... is\n",
   "# tokenized as [two, teen, ...], i.e. its first word is missing; presumably the\n",
   "# leading quote makes that token fail isalpha(). Confirm, and consider\n",
   "# stripping the quoting from the raw line before tokenizing.\n",
   "\n",
   "# Column 0 holds the raw line. Applying per column gives the same values as\n",
   "# the row-wise axis=1 form without building a row object for every review.\n",
   "all_df['tokenized'] = all_df[0].apply(get_tokens)\n",
   "all_df['tokenized_count'] = all_df['tokenized'].apply(len)"
  ]
 },
 { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "tokenized | \n", "tokenized_count | \n", "
---|---|---|---|
0 | \n", "text,reviewclass\\n | \n", "[text, reviewclass] | \n", "2 | \n", "
1 | \n", "'plot : two teen couples go to a church party ... | \n", "[two, teen, couples, go, to, a, church, party,... | \n", "638 | \n", "
2 | \n", "'the happy bastard\\'s quick movie review \\ndam... | \n", "[happy, quick, movie, review, that, bug, got, ... | \n", "215 | \n", "
3 | \n", "'it is movies like these that make a jaded mov... | \n", "[is, movies, like, these, that, make, a, jaded... | \n", "444 | \n", "
4 | \n", "' \\\" quest for camelot \\\" is warner bros . \\' ... | \n", "[quest, for, camelot, is, warner, bros, first,... | \n", "410 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "
1996 | \n", "'wow ! what a movie . \\nit\\'s everything a mov... | \n", "[what, a, movie, everything, a, movie, can, be... | \n", "702 | \n", "
1997 | \n", "'richard gere can be a commanding actor , but ... | \n", "[gere, can, be, a, commanding, actor, but, not... | \n", "286 | \n", "
1998 | \n", "'glory--starring matthew broderick , denzel wa... | \n", "[starring, matthew, broderick, denzel, washing... | \n", "990 | \n", "
1999 | \n", "'steven spielberg\\'s second epic film on world... | \n", "[second, epic, film, on, world, war, ii, is, a... | \n", "538 | \n", "
2000 | \n", "'truman ( \\\" true-man \\\" ) burbank is the perf... | \n", "[burbank, is, the, perfect, name, for, jim, ch... | \n", "901 | \n", "
2001 rows × 3 columns
\n", "\n", " | 0 | \n", "tokenized | \n", "tokenized_count | \n", "no_stopwords | \n", "no_stopwords_count | \n", "
---|---|---|---|---|---|
0 | \n", "text,reviewclass\\n | \n", "[text, reviewclass] | \n", "2 | \n", "[text, reviewclass] | \n", "2 | \n", "
1 | \n", "'plot : two teen couples go to a church party ... | \n", "[two, teen, couples, go, to, a, church, party,... | \n", "638 | \n", "[two, teen, couples, go, church, party, drink,... | \n", "306 | \n", "
2 | \n", "'the happy bastard\\'s quick movie review \\ndam... | \n", "[happy, quick, movie, review, that, bug, got, ... | \n", "215 | \n", "[happy, quick, movie, review, bug, got, head, ... | \n", "119 | \n", "
3 | \n", "'it is movies like these that make a jaded mov... | \n", "[is, movies, like, these, that, make, a, jaded... | \n", "444 | \n", "[movies, like, make, jaded, movie, viewer, tha... | \n", "246 | \n", "
4 | \n", "' \\\" quest for camelot \\\" is warner bros . \\' ... | \n", "[quest, for, camelot, is, warner, bros, first,... | \n", "410 | \n", "[quest, camelot, warner, bros, first, attempt,... | \n", "234 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1996 | \n", "'wow ! what a movie . \\nit\\'s everything a mov... | \n", "[what, a, movie, everything, a, movie, can, be... | \n", "702 | \n", "[movie, everything, movie, funny, dramatic, in... | \n", "355 | \n", "
1997 | \n", "'richard gere can be a commanding actor , but ... | \n", "[gere, can, be, a, commanding, actor, but, not... | \n", "286 | \n", "[gere, commanding, actor, always, great, films... | \n", "148 | \n", "
1998 | \n", "'glory--starring matthew broderick , denzel wa... | \n", "[starring, matthew, broderick, denzel, washing... | \n", "990 | \n", "[starring, matthew, broderick, denzel, washing... | \n", "561 | \n", "
1999 | \n", "'steven spielberg\\'s second epic film on world... | \n", "[second, epic, film, on, world, war, ii, is, a... | \n", "538 | \n", "[second, epic, film, world, war, ii, unquestio... | \n", "287 | \n", "
2000 | \n", "'truman ( \\\" true-man \\\" ) burbank is the perf... | \n", "[burbank, is, the, perfect, name, for, jim, ch... | \n", "901 | \n", "[burbank, perfect, name, jim, character, film,... | \n", "483 | \n", "
2001 rows × 5 columns
\n", "\n", " | 0 | \n", "tokenized | \n", "tokenized_count | \n", "no_stopwords | \n", "no_stopwords_count | \n", "fdist | \n", "bad | \n", "tfidf_bad | \n", "
---|---|---|---|---|---|---|---|---|
1 | \n", "'plot : two teen couples go to a church party ... | \n", "[two, teen, couples, go, to, a, church, party,... | \n", "638 | \n", "[two, teen, couples, go, church, party, drink,... | \n", "306 | \n", "{'two': 2, 'teen': 4, 'couples': 1, 'go': 2, '... | \n", "2 | \n", "0.840433 | \n", "
2 | \n", "'the happy bastard\\'s quick movie review \\ndam... | \n", "[happy, quick, movie, review, that, bug, got, ... | \n", "215 | \n", "[happy, quick, movie, review, bug, got, head, ... | \n", "119 | \n", "{'happy': 1, 'quick': 1, 'movie': 5, 'review':... | \n", "0 | \n", "0.000000 | \n", "
3 | \n", "'it is movies like these that make a jaded mov... | \n", "[is, movies, like, these, that, make, a, jaded... | \n", "444 | \n", "[movies, like, make, jaded, movie, viewer, tha... | \n", "246 | \n", "{'movies': 1, 'like': 4, 'make': 2, 'jaded': 1... | \n", "0 | \n", "0.000000 | \n", "
4 | \n", "' \\\" quest for camelot \\\" is warner bros . \\' ... | \n", "[quest, for, camelot, is, warner, bros, first,... | \n", "410 | \n", "[quest, camelot, warner, bros, first, attempt,... | \n", "234 | \n", "{'quest': 5, 'camelot': 4, 'warner': 1, 'bros'... | \n", "0 | \n", "0.000000 | \n", "
5 | \n", "'synopsis : a mentally unstable man undergoing... | \n", "[a, mentally, unstable, man, undergoing, psych... | \n", "658 | \n", "[mentally, unstable, man, undergoing, psychoth... | \n", "346 | \n", "{'mentally': 1, 'unstable': 1, 'man': 2, 'unde... | \n", "2 | \n", "0.840433 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
1996 | \n", "'wow ! what a movie . \\nit\\'s everything a mov... | \n", "[what, a, movie, everything, a, movie, can, be... | \n", "702 | \n", "[movie, everything, movie, funny, dramatic, in... | \n", "355 | \n", "{'movie': 14, 'everything': 2, 'funny': 5, 'dr... | \n", "0 | \n", "0.000000 | \n", "
1997 | \n", "'richard gere can be a commanding actor , but ... | \n", "[gere, can, be, a, commanding, actor, but, not... | \n", "286 | \n", "[gere, commanding, actor, always, great, films... | \n", "148 | \n", "{'gere': 1, 'commanding': 1, 'actor': 1, 'alwa... | \n", "0 | \n", "0.000000 | \n", "
1998 | \n", "'glory--starring matthew broderick , denzel wa... | \n", "[starring, matthew, broderick, denzel, washing... | \n", "990 | \n", "[starring, matthew, broderick, denzel, washing... | \n", "561 | \n", "{'starring': 1, 'matthew': 1, 'broderick': 2, ... | \n", "0 | \n", "0.000000 | \n", "
1999 | \n", "'steven spielberg\\'s second epic film on world... | \n", "[second, epic, film, on, world, war, ii, is, a... | \n", "538 | \n", "[second, epic, film, world, war, ii, unquestio... | \n", "287 | \n", "{'second': 1, 'epic': 2, 'film': 14, 'world': ... | \n", "0 | \n", "0.000000 | \n", "
2000 | \n", "'truman ( \\\" true-man \\\" ) burbank is the perf... | \n", "[burbank, is, the, perfect, name, for, jim, ch... | \n", "901 | \n", "[burbank, perfect, name, jim, character, film,... | \n", "483 | \n", "{'burbank': 4, 'perfect': 4, 'name': 1, 'jim':... | \n", "0 | \n", "0.000000 | \n", "
2000 rows × 8 columns
\n", "