{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# HW3 JOKER EXTREMES"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 1: Import Data"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"def get_data_from_files(path):\n",
" directory = os.listdir(path)\n",
" results = []\n",
" for file in directory:\n",
" f=open(path+file)\n",
" results.append(f.read())\n",
" f.close()\n",
" return results\n",
"\n",
"neg = get_data_from_files('../NEG_JK_E/')\n",
"pos = get_data_from_files('../POS_JK_E/')"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Neg Reviews: 48\n",
"Pos Reviews: 50\n"
]
}
],
"source": [
"print('Neg Reviews:', len(neg))\n",
"print('Pos Reviews:', len(pos))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 2: Turn into DF & Label it"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"neg_df = pd.DataFrame(neg)\n",
"pos_df = pd.DataFrame(pos)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Add labels"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"pos_df['PoN'] = 'P'\n",
"neg_df['PoN'] = 'N'"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"all_df = neg_df.append(pos_df)"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" PoN | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Everyone praised an overrated movie.\\nOverrat... | \n",
" N | \n",
"
\n",
" \n",
" 1 | \n",
" What idiotic FIlm\\nI can say that Phoenix is ... | \n",
" N | \n",
"
\n",
" \n",
" 2 | \n",
" Terrible\\nThe only thing good about this movi... | \n",
" N | \n",
"
\n",
" \n",
" 3 | \n",
" Watch Taxi Driver instead\\nThis is a poor att... | \n",
" N | \n",
"
\n",
" \n",
" 4 | \n",
" I learned one thing.\\nIt borrows a lot of ele... | \n",
" N | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 PoN\n",
"0 Everyone praised an overrated movie.\\nOverrat... N\n",
"1 What idiotic FIlm\\nI can say that Phoenix is ... N\n",
"2 Terrible\\nThe only thing good about this movi... N\n",
"3 Watch Taxi Driver instead\\nThis is a poor att... N\n",
"4 I learned one thing.\\nIt borrows a lot of ele... N"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 3: Tokenize and Clean!"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from nltk.sentiment import SentimentAnalyzer\n",
"from nltk.sentiment.util import *"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"def get_tokens(sentence):\n",
" tokens = word_tokenize(sentence)\n",
" clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
" return clean_tokens\n",
"\n",
"all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
"all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" PoN | \n",
" tokens | \n",
" num_tokens | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Everyone praised an overrated movie.\\nOverrat... | \n",
" N | \n",
" [everyone, praised, an, overrated, movie, over... | \n",
" 26 | \n",
"
\n",
" \n",
" 1 | \n",
" What idiotic FIlm\\nI can say that Phoenix is ... | \n",
" N | \n",
" [what, idiotic, film, i, can, say, that, phoen... | \n",
" 66 | \n",
"
\n",
" \n",
" 2 | \n",
" Terrible\\nThe only thing good about this movi... | \n",
" N | \n",
" [terrible, the, only, thing, good, about, this... | \n",
" 124 | \n",
"
\n",
" \n",
" 3 | \n",
" Watch Taxi Driver instead\\nThis is a poor att... | \n",
" N | \n",
" [watch, taxi, driver, instead, this, is, a, po... | \n",
" 123 | \n",
"
\n",
" \n",
" 4 | \n",
" I learned one thing.\\nIt borrows a lot of ele... | \n",
" N | \n",
" [i, learned, one, thing, it, borrows, a, lot, ... | \n",
" 70 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 PoN \\\n",
"0 Everyone praised an overrated movie.\\nOverrat... N \n",
"1 What idiotic FIlm\\nI can say that Phoenix is ... N \n",
"2 Terrible\\nThe only thing good about this movi... N \n",
"3 Watch Taxi Driver instead\\nThis is a poor att... N \n",
"4 I learned one thing.\\nIt borrows a lot of ele... N \n",
"\n",
" tokens num_tokens \n",
"0 [everyone, praised, an, overrated, movie, over... 26 \n",
"1 [what, idiotic, film, i, can, say, that, phoen... 66 \n",
"2 [terrible, the, only, thing, good, about, this... 124 \n",
"3 [watch, taxi, driver, instead, this, is, a, po... 123 \n",
"4 [i, learned, one, thing, it, borrows, a, lot, ... 70 "
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 4: Create Bag of Words"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"from nltk.tokenize import casual_tokenize\n",
"from collections import Counter\n",
"# all_df['bow'] = all_df.apply(lambda x: Counter(casual_tokenize(x[0])), axis=1)\n",
"all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" PoN | \n",
" tokens | \n",
" num_tokens | \n",
" bow | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" Everyone praised an overrated movie.\\nOverrat... | \n",
" N | \n",
" [everyone, praised, an, overrated, movie, over... | \n",
" 26 | \n",
" {'everyone': 1, 'praised': 1, 'an': 1, 'overra... | \n",
"
\n",
" \n",
" 1 | \n",
" What idiotic FIlm\\nI can say that Phoenix is ... | \n",
" N | \n",
" [what, idiotic, film, i, can, say, that, phoen... | \n",
" 66 | \n",
" {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '... | \n",
"
\n",
" \n",
" 2 | \n",
" Terrible\\nThe only thing good about this movi... | \n",
" N | \n",
" [terrible, the, only, thing, good, about, this... | \n",
" 124 | \n",
" {'terrible': 3, 'the': 5, 'only': 1, 'thing': ... | \n",
"
\n",
" \n",
" 3 | \n",
" Watch Taxi Driver instead\\nThis is a poor att... | \n",
" N | \n",
" [watch, taxi, driver, instead, this, is, a, po... | \n",
" 123 | \n",
" {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'... | \n",
"
\n",
" \n",
" 4 | \n",
" I learned one thing.\\nIt borrows a lot of ele... | \n",
" N | \n",
" [i, learned, one, thing, it, borrows, a, lot, ... | \n",
" 70 | \n",
" {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '... | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" 0 PoN \\\n",
"0 Everyone praised an overrated movie.\\nOverrat... N \n",
"1 What idiotic FIlm\\nI can say that Phoenix is ... N \n",
"2 Terrible\\nThe only thing good about this movi... N \n",
"3 Watch Taxi Driver instead\\nThis is a poor att... N \n",
"4 I learned one thing.\\nIt borrows a lot of ele... N \n",
"\n",
" tokens num_tokens \\\n",
"0 [everyone, praised, an, overrated, movie, over... 26 \n",
"1 [what, idiotic, film, i, can, say, that, phoen... 66 \n",
"2 [terrible, the, only, thing, good, about, this... 124 \n",
"3 [watch, taxi, driver, instead, this, is, a, po... 123 \n",
"4 [i, learned, one, thing, it, borrows, a, lot, ... 70 \n",
"\n",
" bow \n",
"0 {'everyone': 1, 'praised': 1, 'an': 1, 'overra... \n",
"1 {'what': 1, 'idiotic': 1, 'film': 1, 'i': 1, '... \n",
"2 {'terrible': 3, 'the': 5, 'only': 1, 'thing': ... \n",
"3 {'watch': 1, 'taxi': 2, 'driver': 2, 'instead'... \n",
"4 {'i': 1, 'learned': 1, 'one': 1, 'thing': 1, '... "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"all_df[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 5: Vectorize -- Create a Frequency Distribution Matrix"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" everyone | \n",
" praised | \n",
" an | \n",
" overrated | \n",
" movie | \n",
" of | \n",
" all | \n",
" time | \n",
" the | \n",
" reviews | \n",
" ... | \n",
" easy | \n",
" answers | \n",
" questions | \n",
" raises | \n",
" albeit | \n",
" reinvention | \n",
" source | \n",
" material | \n",
" alike | \n",
" disturbed | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 2 | \n",
" 2 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" 1 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 2 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 4 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 5 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 3 | \n",
" 5 | \n",
" 0 | \n",
" 0 | \n",
" 9 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 0 | \n",
" 0 | \n",
" 1 | \n",
" 0 | \n",
" ... | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
" 0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 2648 columns
\n",
"
"
],
"text/plain": [
" everyone praised an overrated movie of all time the reviews ... \\\n",
"0 1 1 1 2 2 1 1 1 1 1 ... \n",
"1 0 0 0 0 2 0 0 0 2 0 ... \n",
"2 0 0 0 1 4 2 0 0 5 0 ... \n",
"3 0 0 0 0 3 5 0 0 9 0 ... \n",
"4 1 0 1 0 1 2 0 0 1 0 ... \n",
"\n",
" easy answers questions raises albeit reinvention source material \\\n",
"0 0 0 0 0 0 0 0 0 \n",
"1 0 0 0 0 0 0 0 0 \n",
"2 0 0 0 0 0 0 0 0 \n",
"3 0 0 0 0 0 0 0 0 \n",
"4 0 0 0 0 0 0 0 0 \n",
"\n",
" alike disturbed \n",
"0 0 0 \n",
"1 0 0 \n",
"2 0 0 \n",
"3 0 0 \n",
"4 0 0 \n",
"\n",
"[5 rows x 2648 columns]"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"freq_df = pd.DataFrame(all_df['bow'].tolist())\n",
"freq_df = freq_df.fillna(0).astype(int)\n",
"freq_df[:5]"
]
},
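{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (a minimal sketch, not part of the original pipeline): scikit-learn's `CountVectorizer` builds the same kind of document-term count matrix straight from the raw text. Its default token pattern drops one-character tokens like 'i', so its vocabulary will differ slightly from the NLTK-based cleaning above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import CountVectorizer\n",
"\n",
"# Sketch: an equivalent frequency matrix built directly from the raw reviews\n",
"cv = CountVectorizer(lowercase=True)\n",
"counts = cv.fit_transform(all_df[0])  # sparse document-term counts\n",
"# get_feature_names_out() needs scikit-learn >= 1.0 (older versions: get_feature_names())\n",
"alt_freq_df = pd.DataFrame(counts.toarray(), columns=cv.get_feature_names_out())\n",
"alt_freq_df[:5]"
]
},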
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 6: Normalize"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With simple weights"
]
},
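{
"cell_type": "markdown",
"metadata": {},
"source": [
"A minimal sketch, assuming 'simple weights' means per-document relative frequencies: divide each review's counts by its total token count so long reviews don't dominate."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: length-normalize the raw counts so each row sums to 1\n",
"# (assumes 'simple weights' = per-document relative term frequencies)\n",
"simple_df = freq_df.div(freq_df.sum(axis=1), axis=0)\n",
"simple_df[:5]"
]
},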
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### With TFIDF"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.feature_extraction.text import TfidfTransformer\n",
"tfidf =TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False)\n",
"data =tfidf.fit_transform(freq_df.values)\n",
"tfidf_reduced = pd.DataFrame(data.todense())"
]
},
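{
"cell_type": "markdown",
"metadata": {},
"source": [
"For reference (per the scikit-learn docs for `TfidfTransformer`): with `smooth_idf=True`, each raw count $tf(t, d)$ is weighted by\n",
"\n",
"$$idf(t) = \\ln\\frac{1 + n}{1 + df(t)} + 1$$\n",
"\n",
"where $n$ is the number of documents and $df(t)$ is the number of documents containing term $t$; `norm='l2'` then rescales every row to unit length."
]
},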
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" 0 | \n",
" 1 | \n",
" 2 | \n",
" 3 | \n",
" 4 | \n",
" 5 | \n",
" 6 | \n",
" 7 | \n",
" 8 | \n",
" 9 | \n",
" ... | \n",
" 2638 | \n",
" 2639 | \n",
" 2640 | \n",
" 2641 | \n",
" 2642 | \n",
" 2643 | \n",
" 2644 | \n",
" 2645 | \n",
" 2646 | \n",
" 2647 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0.200322 | \n",
" 0.340128 | \n",
" 0.127248 | \n",
" 0.553100 | \n",
" 0.190896 | \n",
" 0.083310 | \n",
" 0.130553 | \n",
" 0.162156 | \n",
" 0.073724 | \n",
" 0.221842 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.112320 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.086756 | \n",
" 0.000000 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.106806 | \n",
" 0.147451 | \n",
" 0.064349 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.142363 | \n",
" 0.000000 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.109289 | \n",
" 0.158984 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.253245 | \n",
" 0.000000 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 0.106888 | \n",
" 0.000000 | \n",
" 0.067897 | \n",
" 0.000000 | \n",
" 0.050929 | \n",
" 0.088905 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.039338 | \n",
" 0.000000 | \n",
" ... | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 2648 columns
\n",
"
"
],
"text/plain": [
" 0 1 2 3 4 5 6 \\\n",
"0 0.200322 0.340128 0.127248 0.553100 0.190896 0.083310 0.130553 \n",
"1 0.000000 0.000000 0.000000 0.000000 0.112320 0.000000 0.000000 \n",
"2 0.000000 0.000000 0.000000 0.106806 0.147451 0.064349 0.000000 \n",
"3 0.000000 0.000000 0.000000 0.000000 0.109289 0.158984 0.000000 \n",
"4 0.106888 0.000000 0.067897 0.000000 0.050929 0.088905 0.000000 \n",
"\n",
" 7 8 9 ... 2638 2639 2640 2641 2642 2643 \\\n",
"0 0.162156 0.073724 0.221842 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"1 0.000000 0.086756 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"2 0.000000 0.142363 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"3 0.000000 0.253245 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"4 0.000000 0.039338 0.000000 ... 0.0 0.0 0.0 0.0 0.0 0.0 \n",
"\n",
" 2644 2645 2646 2647 \n",
"0 0.0 0.0 0.0 0.0 \n",
"1 0.0 0.0 0.0 0.0 \n",
"2 0.0 0.0 0.0 0.0 \n",
"3 0.0 0.0 0.0 0.0 \n",
"4 0.0 0.0 0.0 0.0 \n",
"\n",
"[5 rows x 2648 columns]"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"tfidf_reduced[:5]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## STEP 7: Test"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.model_selection import train_test_split\n",
"from sklearn.naive_bayes import GaussianNB\n",
"\n",
"def get_NB(small_df, labels):\n",
" x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
"\n",
" gnb = GaussianNB()\n",
" gnb.fit(x_train, y_train)\n",
" y_pred = gnb.predict(x_test)\n",
" from sklearn import metrics\n",
" print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Accuracy: 0.7666666666666667\n"
]
}
],
"source": [
"get_NB(tfidf_reduced, all_df['PoN'])"
]
},
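{
"cell_type": "markdown",
"metadata": {},
"source": [
"Accuracy alone hides per-class behavior. A small follow-up sketch (same split and model as `get_NB`, re-fit here because the function doesn't return the classifier) to see precision and recall for P and N separately:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from sklearn.metrics import classification_report\n",
"\n",
"# Recreate the same split as get_NB (identical random_state) and inspect per-class results\n",
"x_train, x_test, y_train, y_test = train_test_split(tfidf_reduced.values, all_df['PoN'], test_size=0.3, random_state=109)\n",
"gnb = GaussianNB().fit(x_train, y_train)\n",
"print(classification_report(y_test, gnb.predict(x_test)))"
]
},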
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}