{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Sentiment Analysis: TextBlob + Vader \n", "via [this tutorial](https://levelup.gitconnected.com/sentiment-analysis-using-machine-learning-python-9122e03f8f7b) |10-6-19" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [], "source": [ "from textblob import TextBlob\n", "from IPython.display import display, HTML\n", "import os\n", "import pandas as pd\n", "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n", "sid = SentimentIntensityAnalyzer()" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [], "source": [ "def get_data_from_files(path):\n", " directory = os.listdir(path)\n", " results = []\n", " for file in directory:\n", " f=open(path+file)\n", " results.append(f.read())\n", " f.close()\n", " return results\n", "\n", "neg_k = get_data_from_files('AI_NEG/')\n", "pos_k = get_data_from_files('AI_POS/')\n", "neg_a = get_data_from_files('NEG/')\n", "pos_a = get_data_from_files('POS/')\n", "neg_cornell = get_data_from_files('neg_cornell/')\n", "pos_cornell = get_data_from_files('pos_cornell/')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# TEXT BLOB" ] }, { "cell_type": "code", "execution_count": 41, "metadata": {}, "outputs": [], "source": [ "def get_pn(num):\n", " return 'neg' if num < 0 else 'pos'\n", "\n", "def get_sentiment(array, label):\n", " blobs = [[TextBlob(text), text] for text in array]\n", " return ([{'label': label,\n", " 'prediction': get_pn(obj.sentiment.polarity),\n", " 'sentiment': obj.sentiment.polarity,\n", " 'length': len(text), \n", " 'excerpt': text[:50]} for obj,text in blobs])" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 1: Kendra's Data" ] }, { "cell_type": "code", "execution_count": 42, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerpt
0negneg-0.15714376WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1negneg-0.75000096How can we trust Artificial Intelligence to dr...
2negneg-0.77500031I hate artificial intelligence!
3negneg-0.75000047My dog is terrified by artificial intelligence!
4negneg-0.75000068Artificial intelligence is going to melt the b...
\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 neg neg -0.157143 76 \n", "1 neg neg -0.750000 96 \n", "2 neg neg -0.775000 31 \n", "3 neg neg -0.750000 47 \n", "4 neg neg -0.750000 68 \n", "\n", " excerpt \n", "0 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI... \n", "1 How can we trust Artificial Intelligence to dr... \n", "2 I hate artificial intelligence! \n", "3 My dog is terrified by artificial intelligence! \n", "4 Artificial intelligence is going to melt the b... " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerpt
0posneg-0.11250065My dog is excited by the advancements in artif...
1posneg-0.075000133I'm excited for my child to grow up and have t...
2posneg-0.12500031I love artificial intelligence!
3posneg-0.300000121Order my groceries, pay my taxes, take my kids...
4posneg-0.133333116I'm grateful every day that my child will like...
\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 pos neg -0.112500 65 \n", "1 pos neg -0.075000 133 \n", "2 pos neg -0.125000 31 \n", "3 pos neg -0.300000 121 \n", "4 pos neg -0.133333 116 \n", "\n", " excerpt \n", "0 My dog is excited by the advancements in artif... \n", "1 I'm excited for my child to grow up and have t... \n", "2 I love artificial intelligence! \n", "3 Order my groceries, pay my taxes, take my kids... \n", "4 I'm grateful every day that my child will like... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(pd.DataFrame(get_sentiment(neg_k, 'neg')))\n", "display(pd.DataFrame(get_sentiment(pos_k, 'pos')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 2: Ami's Data" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerpt
0negneg-0.0545773554that's exactly how long the movie felt to me ....
1negpos0.0254672929\" quest for camelot \" is warner bros . ' firs...
2negpos0.0033343365so ask yourself what \" 8mm \" ( \" eight millime...
3negpos0.0229254418synopsis : a mentally unstable man undergoing ...
4negpos0.0432343911capsule : in 2176 on the planet mars police ta...
\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 neg neg -0.054577 3554 \n", "1 neg pos 0.025467 2929 \n", "2 neg pos 0.003334 3365 \n", "3 neg pos 0.022925 4418 \n", "4 neg pos 0.043234 3911 \n", "\n", " excerpt \n", "0 that's exactly how long the movie felt to me .... \n", "1 \" quest for camelot \" is warner bros . ' firs... \n", "2 so ask yourself what \" 8mm \" ( \" eight millime... \n", "3 synopsis : a mentally unstable man undergoing ... \n", "4 capsule : in 2176 on the planet mars police ta... " ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerpt
0pospos0.0236634227films adapted from comic books have had plenty...
1pospos0.1310922421you've got mail works alot better than it dese...
2pospos0.1106266092\" jaws \" is a rare film that grabs your atten...
3pospos0.1038474096every now and then a movie comes along from a ...
4posneg-0.0701513898moviemaking is a lot like being the general ma...
\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 pos pos 0.023663 4227 \n", "1 pos pos 0.131092 2421 \n", "2 pos pos 0.110626 6092 \n", "3 pos pos 0.103847 4096 \n", "4 pos neg -0.070151 3898 \n", "\n", " excerpt \n", "0 films adapted from comic books have had plenty... \n", "1 you've got mail works alot better than it dese... \n", "2 \" jaws \" is a rare film that grabs your atten... \n", "3 every now and then a movie comes along from a ... \n", "4 moviemaking is a lot like being the general ma... " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(pd.DataFrame(get_sentiment(neg_a, 'neg')))\n", "display(pd.DataFrame(get_sentiment(pos_a, 'pos')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 3: Cornell Data" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "229\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerptaccurate
0negpos0.0262405953bad . bad . \\nbad . \\nthat one word seems to p...no
1negpos0.0760403396isn't it the ultimate sign of a movie's cinema...no
2negneg-0.1287332762\" gordy \" is not a movie , it is a 90-minute-...yes
3negneg-0.0004853840disconnect the phone line . \\ndon't accept the...yes
4negpos0.1227702270when robert forster found himself famous again...no
.....................
995negpos0.1454891945synopsis : when a meteorite crashlands in the ...no
996negpos0.1027233116it's now the anniversary of the slayings of ju...no
997negpos0.0424731755coinciding with the emerging popularity of mov...no
998negneg-0.0486562826and now the high-flying hong kong style of fil...yes
999negneg-0.0906554165battlefield long , boring and just plain stupi...yes
\n", "

1000 rows × 6 columns

\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 neg pos 0.026240 5953 \n", "1 neg pos 0.076040 3396 \n", "2 neg neg -0.128733 2762 \n", "3 neg neg -0.000485 3840 \n", "4 neg pos 0.122770 2270 \n", ".. ... ... ... ... \n", "995 neg pos 0.145489 1945 \n", "996 neg pos 0.102723 3116 \n", "997 neg pos 0.042473 1755 \n", "998 neg neg -0.048656 2826 \n", "999 neg neg -0.090655 4165 \n", "\n", " excerpt accurate \n", "0 bad . bad . \\nbad . \\nthat one word seems to p... no \n", "1 isn't it the ultimate sign of a movie's cinema... no \n", "2 \" gordy \" is not a movie , it is a 90-minute-... yes \n", "3 disconnect the phone line . \\ndon't accept the... yes \n", "4 when robert forster found himself famous again... no \n", ".. ... ... \n", "995 synopsis : when a meteorite crashlands in the ... no \n", "996 it's now the anniversary of the slayings of ju... no \n", "997 coinciding with the emerging popularity of mov... no \n", "998 and now the high-flying hong kong style of fil... yes \n", "999 battlefield long , boring and just plain stupi... yes \n", "\n", "[1000 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "971\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelpredictionsentimentlengthexcerptaccurate
0pospos0.2211734662assume nothing . \\nthe phrase is perhaps one o...yes
1pospos0.0897363839plot : derek zoolander is a male model . \\nhe ...yes
2pospos0.2067439380i actually am a fan of the original 1961 or so...yes
3pospos0.1419052407a movie that's been as highly built up as the ...yes
4pospos0.1763321840\" good will hunting \" is two movies in one : ...yes
.....................
995pospos0.0728152658one of the funniest carry on movies and the th...yes
996pospos0.1028794196i remember making a pact , right after `patch ...yes
997pospos0.1950972094barely scrapping by playing at a nyc piano bar...yes
998pospos0.1175304575if the current trends of hollywood filmmaking ...yes
999posneg-0.0135693870capsule : the director of cure brings a weird ...no
\n", "

1000 rows × 6 columns

\n", "
" ], "text/plain": [ " label prediction sentiment length \\\n", "0 pos pos 0.221173 4662 \n", "1 pos pos 0.089736 3839 \n", "2 pos pos 0.206743 9380 \n", "3 pos pos 0.141905 2407 \n", "4 pos pos 0.176332 1840 \n", ".. ... ... ... ... \n", "995 pos pos 0.072815 2658 \n", "996 pos pos 0.102879 4196 \n", "997 pos pos 0.195097 2094 \n", "998 pos pos 0.117530 4575 \n", "999 pos neg -0.013569 3870 \n", "\n", " excerpt accurate \n", "0 assume nothing . \\nthe phrase is perhaps one o... yes \n", "1 plot : derek zoolander is a male model . \\nhe ... yes \n", "2 i actually am a fan of the original 1961 or so... yes \n", "3 a movie that's been as highly built up as the ... yes \n", "4 \" good will hunting \" is two movies in one : ... yes \n", ".. ... ... \n", "995 one of the funniest carry on movies and the th... yes \n", "996 i remember making a pact , right after `patch ... yes \n", "997 barely scrapping by playing at a nyc piano bar... yes \n", "998 if the current trends of hollywood filmmaking ... yes \n", "999 capsule : the director of cure brings a weird ... no \n", "\n", "[1000 rows x 6 columns]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_n = pd.DataFrame(get_sentiment(neg_cornell, 'neg'))\n", "df_p = pd.DataFrame(get_sentiment(pos_cornell, 'pos'))\n", "\n", "import numpy as np\n", "df_n['accurate'] = np.where(df_n['label'] == df_n['prediction'], 'yes', 'no')\n", "df_p['accurate'] = np.where(df_p['label'] == df_p['prediction'], 'yes', 'no')\n", "\n", "print((df_n['accurate']=='yes').sum())\n", "display(df_n)\n", "\n", "print((df_p['accurate']=='yes').sum())\n", "display(df_p)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# VADER" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "def get_vader_scores(array, label):\n", " vader_array = []\n", " for sentence in array:\n", " ss = sid.polarity_scores(sentence)\n", " vader_array.append({'label': label, 'compound': ss['compound'], 'excerpt': sentence[:50]})\n", " return vader_array" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[\"WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICIAL INTELLIGENCE TOOK OUR JOBS.\",\n", " \"How can we trust Artificial Intelligence to drive our cars when they can't even hack a captcha?!\",\n", " 'I hate artificial intelligence!',\n", " 'My dog is terrified by artificial intelligence!',\n", " 'Artificial intelligence is going to melt the brains of our children!']" ] }, "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ "neg_k" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 1: Kendra's Data" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcompoundexcerpt
0neg0.5255WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...
1neg0.7712How can we trust Artificial Intelligence to dr...
2neg-0.2244I hate artificial intelligence!
3neg-0.2942My dog is terrified by artificial intelligence!
4neg0.5255Artificial intelligence is going to melt the b...
\n", "
" ], "text/plain": [ " label compound excerpt\n", "0 neg 0.5255 WHERE ARE THE JOBS?! OH THAT'S RIGHT. ARTIFICI...\n", "1 neg 0.7712 How can we trust Artificial Intelligence to dr...\n", "2 neg -0.2244 I hate artificial intelligence!\n", "3 neg -0.2942 My dog is terrified by artificial intelligence!\n", "4 neg 0.5255 Artificial intelligence is going to melt the b..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcompoundexcerpt
0pos0.6705My dog is excited by the advancements in artif...
1pos0.8271I'm excited for my child to grow up and have t...
2pos0.8221I love artificial intelligence!
3pos0.8213Order my groceries, pay my taxes, take my kids...
4pos0.8402I'm grateful every day that my child will like...
\n", "
" ], "text/plain": [ " label compound excerpt\n", "0 pos 0.6705 My dog is excited by the advancements in artif...\n", "1 pos 0.8271 I'm excited for my child to grow up and have t...\n", "2 pos 0.8221 I love artificial intelligence!\n", "3 pos 0.8213 Order my groceries, pay my taxes, take my kids...\n", "4 pos 0.8402 I'm grateful every day that my child will like..." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(pd.DataFrame(get_vader_scores(neg_k, 'neg')))\n", "display(pd.DataFrame(get_vader_scores(pos_k, 'pos')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 2: Ami's Data" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcompoundexcerpt
0neg0.7836that's exactly how long the movie felt to me ....
1neg-0.8481\" quest for camelot \" is warner bros . ' firs...
2neg-0.9753so ask yourself what \" 8mm \" ( \" eight millime...
3neg0.6824synopsis : a mentally unstable man undergoing ...
4neg-0.9879capsule : in 2176 on the planet mars police ta...
\n", "
" ], "text/plain": [ " label compound excerpt\n", "0 neg 0.7836 that's exactly how long the movie felt to me ....\n", "1 neg -0.8481 \" quest for camelot \" is warner bros . ' firs...\n", "2 neg -0.9753 so ask yourself what \" 8mm \" ( \" eight millime...\n", "3 neg 0.6824 synopsis : a mentally unstable man undergoing ...\n", "4 neg -0.9879 capsule : in 2176 on the planet mars police ta..." ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelcompoundexcerpt
0pos-0.5887films adapted from comic books have had plenty...
1pos0.9964you've got mail works alot better than it dese...
2pos0.9868\" jaws \" is a rare film that grabs your atten...
3pos0.8825every now and then a movie comes along from a ...
4pos-0.3525moviemaking is a lot like being the general ma...
\n", "
" ], "text/plain": [ " label compound excerpt\n", "0 pos -0.5887 films adapted from comic books have had plenty...\n", "1 pos 0.9964 you've got mail works alot better than it dese...\n", "2 pos 0.9868 \" jaws \" is a rare film that grabs your atten...\n", "3 pos 0.8825 every now and then a movie comes along from a ...\n", "4 pos -0.3525 moviemaking is a lot like being the general ma..." ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "display(pd.DataFrame(get_vader_scores(neg_a, 'neg')))\n", "display(pd.DataFrame(get_vader_scores(pos_a, 'pos')))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# NLTK with NaiveBayes" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "from nltk.classify import NaiveBayesClassifier\n", "from nltk.tokenize import word_tokenize\n", "from nltk.sentiment import SentimentAnalyzer\n", "from nltk.sentiment.util import *\n", "\n", "def get_tokens(sentence):\n", " tokens = word_tokenize(sentence)\n", " clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n", " return clean_tokens\n", "\n", "def get_nltk_train_test(array, label, num_train):\n", " tokens = [get_tokens(sentence) for sentence in array]\n", " docs = [(sent, label) for sent in tokens]\n", " train_docs = docs[:num_train]\n", " test_docs = docs[num_train:len(array)]\n", " return [train_docs, test_docs]\n", "\n", "\n", "def get_nltk_NB(NEG_DATA, POS_DATA, num_train):\n", " train_neg, test_neg = get_nltk_train_test(NEG_DATA, 'neg', num_train)\n", " train_pos, test_pos = get_nltk_train_test(POS_DATA, 'pos', num_train)\n", "\n", " training_docs = train_neg + train_pos\n", " testing_docs = test_neg + test_pos\n", "\n", " sentim_analyzer = SentimentAnalyzer()\n", " all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])\n", " unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n", " sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)\n", " training_set = sentim_analyzer.apply_features(training_docs)\n", " test_set = sentim_analyzer.apply_features(testing_docs)\n", "\n", " trainer = NaiveBayesClassifier.train\n", " classifier = sentim_analyzer.train(trainer, training_set)\n", " \n", " results = []\n", " for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):\n", " print('{0}: {1}'.format(key,value))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 1: Kendra's Data" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training classifier\n", "Evaluating NaiveBayesClassifier results...\n", "Accuracy: 1.0\n", "F-measure [neg]: 1.0\n", "F-measure [pos]: 1.0\n", "Precision [neg]: 1.0\n", "Precision [pos]: 1.0\n", "Recall [neg]: 1.0\n", "Recall [pos]: 1.0\n" ] } ], "source": [ "get_nltk_NB(neg_k, pos_k, 4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 2: Ami's Data" ] }, { "cell_type": "code", "execution_count": 51, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training classifier\n", "Evaluating NaiveBayesClassifier results...\n", "Accuracy: 0.5\n", "F-measure [neg]: 0.6666666666666666\n", "F-measure [pos]: None\n", "Precision [neg]: 0.5\n", "Precision [pos]: None\n", "Recall [neg]: 1.0\n", "Recall [pos]: 0.0\n" ] } ], "source": [ "get_nltk_NB(neg_a, pos_a, 4)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## CASE STUDY 3: Cornell's Data" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Training classifier\n", "Evaluating NaiveBayesClassifier results...\n", "Accuracy: 0.8125\n", "F-measure [neg]: 0.8259860788863109\n", "F-measure [pos]: 0.7967479674796748\n", "Precision [neg]: 0.7705627705627706\n", "Precision [pos]: 0.8698224852071006\n", "Recall [neg]: 0.89\n", "Recall [pos]: 0.735\n" ] } ], "source": [ "get_nltk_NB(neg_cornell, pos_cornell, 800)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 2 }