{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW3 JOKER EXTREMES" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 1: Import Data" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import os\n", "def get_data_from_files(path):\n", "    \"\"\"Read every file in directory `path` and return their contents as a list of strings.\n", "\n", "    Files are read in os.listdir() order (arbitrary but stable per platform).\n", "    \"\"\"\n", "    results = []\n", "    for file in os.listdir(path):\n", "        # Context manager guarantees the handle is closed even if read() fails;\n", "        # os.path.join works whether or not `path` ends with a separator.\n", "        with open(os.path.join(path, file)) as f:\n", "            results.append(f.read())\n", "    return results\n", "\n", "neg = get_data_from_files('../NEG_JK_E/')\n", "pos = get_data_from_files('../POS_JK_E/')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Neg Reviews: 48\n", "Pos Reviews: 50\n" ] } ], "source": [ "print('Neg Reviews:', len(neg))\n", "print('Pos Reviews:', len(pos))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 2: Turn into DF & Label it" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "neg_df = pd.DataFrame(neg)\n", "pos_df = pd.DataFrame(pos)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Add labels" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "pos_df['PoN'] = 'P'\n", "neg_df['PoN'] = 'N'" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "# DataFrame.append was removed in pandas 2.0; pd.concat is the supported API\n", "# and produces the same stacked frame (original indices intentionally kept,\n", "# matching the old append(ignore_index=False) default).\n", "all_df = pd.concat([neg_df, pos_df])" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoN
0Everyone praised an overrated movie.\\nOverrat...N
1What idiotic FIlm\\nI can say that Phoenix is ...N
2Terrible\\nThe only thing good about this movi...N
3Watch Taxi Driver instead\\nThis is a poor att...N
4I learned one thing.\\nIt borrows a lot of ele...N
\n", "
" ], "text/plain": [ " 0 PoN\n", "0 Everyone praised an overrated movie.\\nOverrat... N\n", "1 What idiotic FIlm\\nI can say that Phoenix is ... N\n", "2 Terrible\\nThe only thing good about this movi... N\n", "3 Watch Taxi Driver instead\\nThis is a poor att... N\n", "4 I learned one thing.\\nIt borrows a lot of ele... N" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 3: Tokenize and Clean!" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import word_tokenize, sent_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "def get_tokens(sentence):\n", "    \"\"\"Tokenize `sentence` and keep only purely alphabetic tokens, lower-cased.\n", "\n", "    Note: isalpha() drops numbers and any punctuation-bearing token entirely.\n", "    \"\"\"\n", "    tokens = word_tokenize(sentence)\n", "    return [word.lower() for word in tokens if word.isalpha()]\n", "\n", "# Column-wise apply is clearer and cheaper than a row-wise (axis=1) lambda;\n", "# column 0 holds the raw review text.\n", "all_df['tokens'] = all_df[0].apply(get_tokens)\n", "all_df['num_tokens'] = all_df['tokens'].apply(len)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNtokensnum_tokens
0Everyone praised an overrated movie.\\nOverrat...N[everyone, praised, an, overrated, movie, over...26
1What idiotic FIlm\\nI can say that Phoenix is ...N[what, idiotic, film, i, can, say, that, phoen...66
2Terrible\\nThe only thing good about this movi...N[terrible, the, only, thing, good, about, this...124
3Watch Taxi Driver instead\\nThis is a poor att...N[watch, taxi, driver, instead, this, is, a, po...123
4I learned one thing.\\nIt borrows a lot of ele...N[i, learned, one, thing, it, borrows, a, lot, ...70
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 Everyone praised an overrated movie.\\nOverrat... N \n", "1 What idiotic FIlm\\nI can say that Phoenix is ... N \n", "2 Terrible\\nThe only thing good about this movi... N \n", "3 Watch Taxi Driver instead\\nThis is a poor att... N \n", "4 I learned one thing.\\nIt borrows a lot of ele... N \n", "\n", " tokens num_tokens \n", "0 [everyone, praised, an, overrated, movie, over... 26 \n", "1 [what, idiotic, film, i, can, say, that, phoen... 66 \n", "2 [terrible, the, only, thing, good, about, this... 124 \n", "3 [watch, taxi, driver, instead, this, is, a, po... 123 \n", "4 [i, learned, one, thing, it, borrows, a, lot, ... 70 " ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 4: Create Bag of Words" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "from nltk.tokenize import casual_tokenize\n", "from collections import Counter\n", "# Per-review term counts; we count the already-cleaned STEP 3 tokens rather\n", "# than re-tokenizing the raw text, so BoW and 'tokens' stay consistent.\n", "all_df['bow'] = all_df['tokens'].apply(Counter)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0PoNtokensnum_tokensbow
0Everyone praised an overrated movie.\\nOverrat...N[everyone, praised, an, overrated, movie, over...26{'everyone': 1, 'praised': 1, 'an': 1, 'overra...
1What idiotic FIlm\\nI can say that Phoenix is ...N[what, idiotic, film, i, can, say, that, phoen...66{'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '...
2Terrible\\nThe only thing good about this movi...N[terrible, the, only, thing, good, about, this...124{'terrible': 3, 'the': 5, 'only': 1, 'thing': ...
3Watch Taxi Driver instead\\nThis is a poor att...N[watch, taxi, driver, instead, this, is, a, po...123{'watch': 1, 'taxi': 2, 'driver': 2, 'instead'...
4I learned one thing.\\nIt borrows a lot of ele...N[i, learned, one, thing, it, borrows, a, lot, ...70{'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '...
\n", "
" ], "text/plain": [ " 0 PoN \\\n", "0 Everyone praised an overrated movie.\\nOverrat... N \n", "1 What idiotic FIlm\\nI can say that Phoenix is ... N \n", "2 Terrible\\nThe only thing good about this movi... N \n", "3 Watch Taxi Driver instead\\nThis is a poor att... N \n", "4 I learned one thing.\\nIt borrows a lot of ele... N \n", "\n", " tokens num_tokens \\\n", "0 [everyone, praised, an, overrated, movie, over... 26 \n", "1 [what, idiotic, film, i, can, say, that, phoen... 66 \n", "2 [terrible, the, only, thing, good, about, this... 124 \n", "3 [watch, taxi, driver, instead, this, is, a, po... 123 \n", "4 [i, learned, one, thing, it, borrows, a, lot, ... 70 \n", "\n", " bow \n", "0 {'everyone': 1, 'praised': 1, 'an': 1, 'overra... \n", "1 {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '... \n", "2 {'terrible': 3, 'the': 5, 'only': 1, 'thing': ... \n", "3 {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'... \n", "4 {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '... " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_df[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 5: Vectorize -- Create a Frequency Distribution Matrix" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
everyonepraisedanoverratedmovieofalltimethereviews...easyanswersquestionsraisesalbeitreinventionsourcematerialalikedisturbed
01112211111...0000000000
10000200020...0000000000
20001420050...0000000000
30000350090...0000000000
41010120010...0000000000
\n", "

5 rows × 2648 columns

\n", "
" ], "text/plain": [ " everyone praised an overrated movie of all time the reviews ... \\\n", "0 1 1 1 2 2 1 1 1 1 1 ... \n", "1 0 0 0 0 2 0 0 0 2 0 ... \n", "2 0 0 0 1 4 2 0 0 5 0 ... \n", "3 0 0 0 0 3 5 0 0 9 0 ... \n", "4 1 0 1 0 1 2 0 0 1 0 ... \n", "\n", " easy answers questions raises albeit reinvention source material \\\n", "0 0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 0 \n", "\n", " alike disturbed \n", "0 0 0 \n", "1 0 0 \n", "2 0 0 \n", "3 0 0 \n", "4 0 0 \n", "\n", "[5 rows x 2648 columns]" ] }, "execution_count": 25, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Document-term frequency matrix: one row per review, one column per\n", "# vocabulary token, 0 where a token does not occur in that review.\n", "freq_df = pd.DataFrame(all_df['bow'].tolist()).fillna(0).astype(int)\n", "freq_df[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 6: Normalize" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With simple weights" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### With TFIDF" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfTransformer\n", "# L2-normalised, smoothed TF-IDF over the raw count matrix.\n", "tfidf = TfidfTransformer(norm='l2', use_idf=True, smooth_idf=True, sublinear_tf=False)\n", "data = tfidf.fit_transform(freq_df.values)\n", "# toarray() gives a plain ndarray; todense() returns the deprecated np.matrix.\n", "tfidf_reduced = pd.DataFrame(data.toarray())" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0123456789...2638263926402641264226432644264526462647
00.2003220.3401280.1272480.5531000.1908960.0833100.1305530.1621560.0737240.221842...0.00.00.00.00.00.00.00.00.00.0
10.0000000.0000000.0000000.0000000.1123200.0000000.0000000.0000000.0867560.000000...0.00.00.00.00.00.00.00.00.00.0
20.0000000.0000000.0000000.1068060.1474510.0643490.0000000.0000000.1423630.000000...0.00.00.00.00.00.00.00.00.00.0
30.0000000.0000000.0000000.0000000.1092890.1589840.0000000.0000000.2532450.000000...0.00.00.00.00.00.00.00.00.00.0
40.1068880.0000000.0678970.0000000.0509290.0889050.0000000.0000000.0393380.000000...0.00.00.00.00.00.00.00.00.00.0
\n", "

5 rows × 2648 columns

\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 \\\n", "0 0.200322 0.340128 0.127248 0.553100 0.190896 0.083310 0.130553 \n", "1 0.000000 0.000000 0.000000 0.000000 0.112320 0.000000 0.000000 \n", "2 0.000000 0.000000 0.000000 0.106806 0.147451 0.064349 0.000000 \n", "3 0.000000 0.000000 0.000000 0.000000 0.109289 0.158984 0.000000 \n", "4 0.106888 0.000000 0.067897 0.000000 0.050929 0.088905 0.000000 \n", "\n", " 7 8 9 ... 2638 2639 2640 2641 2642 2643 \\\n", "0 0.162156 0.073724 0.221842 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "1 0.000000 0.086756 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "2 0.000000 0.142363 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "3 0.000000 0.253245 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "4 0.000000 0.039338 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n", "\n", " 2644 2645 2646 2647 \n", "0 0.0 0.0 0.0 0.0 \n", "1 0.0 0.0 0.0 0.0 \n", "2 0.0 0.0 0.0 0.0 \n", "3 0.0 0.0 0.0 0.0 \n", "4 0.0 0.0 0.0 0.0 \n", "\n", "[5 rows x 2648 columns]" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tfidf_reduced[:5]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## STEP 7: Test" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn import metrics\n", "\n", "def get_NB(small_df, labels):\n", "    \"\"\"Train and evaluate a Gaussian Naive Bayes classifier.\n", "\n", "    Splits `small_df` / `labels` 70/30 with a fixed random_state for\n", "    reproducibility, prints the test-set accuracy, and returns it so\n", "    callers can use the score programmatically.\n", "    \"\"\"\n", "    x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state=109)\n", "    gnb = GaussianNB()\n", "    gnb.fit(x_train, y_train)\n", "    y_pred = gnb.predict(x_test)\n", "    acc = metrics.accuracy_score(y_test, y_pred)\n", "    print(\"Accuracy:\", acc)\n", "    return acc" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.7666666666666667\n" ] } ], "source": [ "get_NB(tfidf_reduced, all_df['PoN'])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }