{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# SENTIMENT ANALYSIS -- (WITH HOMEMADE DATA!)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(via [these docs](http://www.nltk.org/howto/sentiment.html))  |  10-06-19"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 1: Import ALL the things"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.classify import NaiveBayesClassifier\n",
    "from nltk.corpus import subjectivity\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 2: Import the fake data you created just for this project"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pandas as pd\n",
    "negative = os.listdir('AI_NEG/')\n",
    "positive = os.listdir('AI_POS/')\n",
    "positive_alltext = []\n",
    "for file in positive:\n",
    "    f=open('AI_POS/'+file)\n",
    "    content=f.read()\n",
    "    positive_alltext.append(content)\n",
    "    f.close()\n",
    "\n",
    "negative_alltext = []\n",
    "for file in negative:\n",
    "    f=open('AI_NEG/'+file)\n",
    "    content=f.read()\n",
    "    negative_alltext.append(content)\n",
    "    f.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### STEP 2b: Tokenize and clean the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "from nltk.tokenize import word_tokenize\n",
    "\n",
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "negative_alltext_tokens = [get_tokens(sentence) for sentence in negative_alltext]\n",
    "positive_alltext_tokens = [get_tokens(sentence) for sentence in positive_alltext]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "neg_docs = [(sent, 'neg') for sent in negative_alltext_tokens]\n",
    "pos_docs = [(sent, 'pos') for sent in positive_alltext_tokens]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 3: Create `test` and `train` for both `subj` and `obj`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_neg_docs = neg_docs[:4]\n",
    "test_neg_docs = neg_docs[4:5]\n",
    "train_pos_docs = pos_docs[:4]\n",
    "test_pos_docs = pos_docs[4:5]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 4: Combine the two `test` and `train` sets "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_docs = train_neg_docs + train_pos_docs\n",
    "testing_docs = test_neg_docs + test_pos_docs"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 5: Use `SentimentAnalyzer` to mark negation in training docs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentim_analyzer = SentimentAnalyzer()\n",
    "all_words_neg = sentim_analyzer.all_words([mark_negation(doc) for doc in training_docs])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['where',\n",
       " 'are',\n",
       " 'the',\n",
       " 'jobs',\n",
       " 'oh',\n",
       " 'that',\n",
       " 'right',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'took',\n",
       " 'our',\n",
       " 'jobs',\n",
       " 'how',\n",
       " 'can',\n",
       " 'we',\n",
       " 'trust',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'to',\n",
       " 'drive',\n",
       " 'our',\n",
       " 'cars',\n",
       " 'when',\n",
       " 'they',\n",
       " 'ca',\n",
       " 'even',\n",
       " 'hack',\n",
       " 'a',\n",
       " 'captcha',\n",
       " 'i',\n",
       " 'hate',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'my',\n",
       " 'dog',\n",
       " 'is',\n",
       " 'terrified',\n",
       " 'by',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'my',\n",
       " 'dog',\n",
       " 'is',\n",
       " 'excited',\n",
       " 'by',\n",
       " 'the',\n",
       " 'advancements',\n",
       " 'in',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'i',\n",
       " 'excited',\n",
       " 'for',\n",
       " 'my',\n",
       " 'child',\n",
       " 'to',\n",
       " 'grow',\n",
       " 'up',\n",
       " 'and',\n",
       " 'have',\n",
       " 'time',\n",
       " 'to',\n",
       " 'daydream',\n",
       " 'because',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'has',\n",
       " 'taken',\n",
       " 'care',\n",
       " 'of',\n",
       " 'all',\n",
       " 'the',\n",
       " 'nitty',\n",
       " 'gritty',\n",
       " 'i',\n",
       " 'love',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'order',\n",
       " 'my',\n",
       " 'groceries',\n",
       " 'pay',\n",
       " 'my',\n",
       " 'taxes',\n",
       " 'take',\n",
       " 'my',\n",
       " 'kids',\n",
       " 'to',\n",
       " 'school',\n",
       " 'yes',\n",
       " 'please',\n",
       " 'artificial',\n",
       " 'intelligence',\n",
       " 'has',\n",
       " 'given',\n",
       " 'me',\n",
       " 'my',\n",
       " 'life',\n",
       " 'back']"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_words_neg"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Note how this sentiment analyzer is SUPPOSED to mark everything after a negation word with '_NEG'\n",
    "However, we do not have enough data in our 10 text file dataset to actually run this successfully"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 6: Use `unigram_word_feats` to get unigrams features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "65"
      ]
     },
     "execution_count": 63,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unigram_feats = sentim_analyzer.unigram_word_feats(all_words_neg)\n",
    "len(unigram_feats)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 7: Use `add_feat_extractor` to get a feature-value representation of our data \n",
    "#### Apply to both `training_set` and `testing_set`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "sentim_analyzer.add_feat_extractor(extract_unigram_feats, unigrams=unigram_feats)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'contains(artificial)': True, 'contains(intelligence)': True, 'contains(my)': False, 'contains(to)': False, 'contains(the)': True, 'contains(i)': False, 'contains(jobs)': True, 'contains(our)': True, 'contains(dog)': False, 'contains(is)': False, 'contains(by)': False, 'contains(excited)': False, 'contains(has)': False, 'contains(where)': True, 'contains(are)': True, 'contains(oh)': True, 'contains(that)': True, 'contains(right)': True, 'contains(took)': True, 'contains(how)': False, 'contains(can)': False, 'contains(we)': False, 'contains(trust)': False, 'contains(drive)': False, 'contains(cars)': False, 'contains(when)': False, 'contains(they)': False, 'contains(ca)': False, 'contains(even)': False, 'contains(hack)': False, 'contains(a)': False, 'contains(captcha)': False, 'contains(hate)': False, 'contains(terrified)': False, 'contains(advancements)': False, 'contains(in)': False, 'contains(for)': False, 'contains(child)': False, 'contains(grow)': False, 'contains(up)': False, 'contains(and)': False, 'contains(have)': False, 'contains(time)': False, 'contains(daydream)': False, 'contains(because)': False, 'contains(taken)': False, 'contains(care)': False, 'contains(of)': False, 'contains(all)': False, 'contains(nitty)': False, 'contains(gritty)': False, 'contains(love)': False, 'contains(order)': False, 'contains(groceries)': False, 'contains(pay)': False, 'contains(taxes)': False, 'contains(take)': False, 'contains(kids)': False, 'contains(school)': False, 'contains(yes)': False, 'contains(please)': False, 'contains(given)': False, 'contains(me)': False, 'contains(life)': False, 'contains(back)': False}, 'neg')]"
      ]
     },
     "execution_count": 65,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "training_set = sentim_analyzer.apply_features(training_docs)\n",
    "training_set[:1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[({'contains(artificial)': True, 'contains(intelligence)': True, 'contains(my)': False, 'contains(to)': True, 'contains(the)': True, 'contains(i)': False, 'contains(jobs)': False, 'contains(our)': True, 'contains(dog)': False, 'contains(is)': True, 'contains(by)': False, 'contains(excited)': False, 'contains(has)': False, 'contains(where)': False, 'contains(are)': False, 'contains(oh)': False, 'contains(that)': False, 'contains(right)': False, 'contains(took)': False, 'contains(how)': False, 'contains(can)': False, 'contains(we)': False, 'contains(trust)': False, 'contains(drive)': False, 'contains(cars)': False, 'contains(when)': False, 'contains(they)': False, 'contains(ca)': False, 'contains(even)': False, 'contains(hack)': False, 'contains(a)': False, 'contains(captcha)': False, 'contains(hate)': False, 'contains(terrified)': False, 'contains(advancements)': False, 'contains(in)': False, 'contains(for)': False, 'contains(child)': False, 'contains(grow)': False, 'contains(up)': False, 'contains(and)': False, 'contains(have)': False, 'contains(time)': False, 'contains(daydream)': False, 'contains(because)': False, 'contains(taken)': False, 'contains(care)': False, 'contains(of)': True, 'contains(all)': False, 'contains(nitty)': False, 'contains(gritty)': False, 'contains(love)': False, 'contains(order)': False, 'contains(groceries)': False, 'contains(pay)': False, 'contains(taxes)': False, 'contains(take)': False, 'contains(kids)': False, 'contains(school)': False, 'contains(yes)': False, 'contains(please)': False, 'contains(given)': False, 'contains(me)': False, 'contains(life)': False, 'contains(back)': False}, 'neg')]"
      ]
     },
     "execution_count": 66,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test_set = sentim_analyzer.apply_features(testing_docs)\n",
    "test_set[:1]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### STEP 8: FINAL STEP!! We use Naive Bayes to create a trainer and FINALLY classify our data!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training classifier\n"
     ]
    }
   ],
   "source": [
    "trainer = NaiveBayesClassifier.train\n",
    "classifier = sentim_analyzer.train(trainer, training_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Evaluating NaiveBayesClassifier results...\n",
      "Accuracy: 1.0\n",
      "F-measure [neg]: 1.0\n",
      "F-measure [pos]: 1.0\n",
      "Precision [neg]: 1.0\n",
      "Precision [pos]: 1.0\n",
      "Recall [neg]: 1.0\n",
      "Recall [pos]: 1.0\n"
     ]
    }
   ],
   "source": [
    "for key,value in sorted(sentim_analyzer.evaluate(test_set).items()):\n",
    "    print('{0}: {1}'.format(key,value))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## CONCLUSION: We need more data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
