{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW4 PIPELINE + HW6 + HW7 + Final Project\n",
    "## Building off HW2 + HW3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTING\n",
    "## =======================================================\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "## =======================================================\n",
    "## TOKENIZING\n",
    "## =======================================================\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "def get_sentence_tokens(review):\n",
    "    return sent_tokenize(review)\n",
    "\n",
    "## =======================================================\n",
    "## REMOVING STOPWORDS\n",
    "## =======================================================\n",
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "\n",
    "## =======================================================\n",
    "## FREQUENCY DISTRIBUTIONS\n",
    "## =======================================================\n",
    "from nltk.probability import FreqDist\n",
    "def get_most_common(tokens):\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist.most_common(12)\n",
    "\n",
    "def get_most_common(tokens):\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist.most_common(12)\n",
    "\n",
    "def get_fdist(tokens):\n",
    "    return (FreqDist(tokens))\n",
    "\n",
    "## =======================================================\n",
    "## SENTIMENT ANALYSIS\n",
    "## =======================================================\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "sid = SentimentIntensityAnalyzer()\n",
    "\n",
    "def get_vader_score(review):\n",
    "    return sid.polarity_scores(review)\n",
    "\n",
    "def separate_vader_score(vader_score, key):\n",
    "    return vader_score[key]\n",
    "\n",
    "## =======================================================\n",
    "## SUMMARIZER\n",
    "## =======================================================\n",
    "def get_weighted_freq_dist(review, freq_dist):\n",
    "    try:\n",
    "        max_freq = max(freq_dist.values())\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/max_freq)\n",
    "        return freq_dist\n",
    "    except:\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/1)\n",
    "        return freq_dist\n",
    "        \n",
    "\n",
    "def get_sentence_score(review, freq_dist):\n",
    "    sentence_scores = {}\n",
    "    for sent in review:\n",
    "        for word in nltk.word_tokenize(sent.lower()):\n",
    "            if word in freq_dist.keys():\n",
    "                if len(sent.split(' ')) < 30:\n",
    "                    if sent not in sentence_scores.keys():\n",
    "                        sentence_scores[sent] = freq_dist[word]\n",
    "                    else:\n",
    "                        sentence_scores[sent] += freq_dist[word]\n",
    "    return sentence_scores\n",
    "\n",
    "def get_summary_sentences(sentence_scores):\n",
    "    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ''.join(sent[0] for sent in sorted_sentences[:5])\n",
    "\n",
    "def get_freq_words(freq_dist):\n",
    "    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ' '.join(word[0] for word in sorted_words[:50])\n",
    "\n",
    "## =======================================================\n",
    "## MACHINE LEARNING -- NAIVE BAYES\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "\n",
    "# def get_NB(small_df, labels):\n",
    "#     x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "#     gnb = GaussianNB()\n",
    "#     gnb.fit(x_train, y_train)\n",
    "#     y_pred = gnb.predict(x_test)\n",
    "#     print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))\n",
    "    \n",
    "    \n",
    "def get_NB(small_df, labels, classifier, title):\n",
    "    seeds = [109, 210, 420, 19, 7]\n",
    "    dfs = []\n",
    "    overall = []\n",
    "    print(title)\n",
    "    for seed in seeds:\n",
    "        x_train, x_test, y_train, y_test = train_test_split(small_df.values, \n",
    "                                                            labels, test_size=0.3, random_state = seed)\n",
    "        gnb = classifier\n",
    "        gnb.fit(x_train, y_train).score(x_train, y_train)\n",
    "        y_pred = gnb.predict(x_test)\n",
    "        accuracy =  metrics.accuracy_score(y_test, y_pred)\n",
    "        report = metrics.classification_report(y_test, y_pred)\n",
    "        print(\"Accuracy:\", accuracy)\n",
    "#         print(report)\n",
    "        overall.append(accuracy)\n",
    "        cm = confusion_matrix(y_test, y_pred)\n",
    "        # confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\n",
    "#         tn, fp, fn, tp = cm.ravel()\n",
    "        df = pd.DataFrame(cm.ravel())\n",
    "        dfs.append(df)\n",
    "    print('AVERAGE ACCURACY:', sum(overall)/len(overall))\n",
    "    return dfs\n",
    "\n",
    "\n",
    "def display_NB_tables(dfs):\n",
    "    for df in dfs:\n",
    "        print(display(df))\n",
    "        \n",
    "## =======================================================\n",
    "## PLOTS\n",
    "## =======================================================        \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def bar_plot(df, title): \n",
    "    graph = sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Word\")\n",
    "    plt.ylabel(\"Count\")\n",
    "    sns.set_context(\"talk\")\n",
    "    plt.xticks(rotation = 90)\n",
    "    return plt\n",
    "\n",
    "from nltk.tokenize import casual_tokenize\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "## =======================================================\n",
    "## CLEANERS\n",
    "## =======================================================   \n",
    "import re, string\n",
    "def diy_cleaner(review):\n",
    "    try:\n",
    "        both = review.split('\\n')\n",
    "        title = both[0]\n",
    "        review = both[1]\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    except:\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    pattern = re.compile('[\\W_]+')\n",
    "    review = pattern.sub(' ', review)\n",
    "    cleaned = title + ' ' + title + ' ' + review\n",
    "    return cleaned.lower()\n",
    "\n",
    "def pruner(review):\n",
    "    clean_review = ' '.join([word for word in review.split() if len(word) > 3])\n",
    "    return clean_review\n",
    "\n",
    "sentim_analyzer = SentimentAnalyzer()\n",
    "def get_nltk_negs(tokens):\n",
    "    all_words_neg = sentim_analyzer.all_words([mark_negation(tokens)])\n",
    "    return all_words_neg\n",
    "\n",
    "def get_unigram_feats(neg_tokens):\n",
    "    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)\n",
    "    return unigram_feats\n",
    "    \n",
    "def get_bigram_feats(tokens):\n",
    "    ngrams = zip(*[tokens[i:] for i in range(2)])\n",
    "    return [\"_\".join(ngram) for ngram in ngrams]\n",
    "\n",
    "## =======================================================\n",
    "## HELPERS\n",
    "## =======================================================  \n",
    "def get_bow_from_column(df, column):\n",
    "    all_column_data = ' '.join(df[column].tolist())\n",
    "    all_column_fd = Counter(all_column_data.split())\n",
    "    return all_column_fd\n",
    "\n",
    "def get_common_words(num):\n",
    "    most_common_neg = [word[0] for word in big_bow_n.most_common(num)]\n",
    "    most_common_pos = [word[0] for word in big_bow_p.most_common(num)]\n",
    "    in_both = np.intersect1d(most_common_neg, most_common_pos)\n",
    "    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)\n",
    "    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)\n",
    "    return [len(in_both), len(neg_notpos), len(pos_notneg), len(in_both)/num, in_both, neg_notpos, pos_notneg]\n",
    "\n",
    "def get_only_polarized(tokens, common_words):\n",
    "    return [token for token in tokens if token not in common_words[4]] # 70\n",
    "\n",
    "## =======================================================\n",
    "## VISUALS\n",
    "## =======================================================  \n",
    "import wordcloud\n",
    "from wordcloud import WordCloud, ImageColorGenerator\n",
    "from PIL import Image\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def create_word_cloud_with_mask(path_of_mask_image, dictionary, \n",
    "                                max_num_words, title):\n",
    "        mask = np.array(Image.open(path_of_mask_image))\n",
    "        word_cloud = WordCloud(background_color = \"white\", \n",
    "                               max_words = max_num_words, \n",
    "                               mask = mask, max_font_size = 125, \n",
    "                               random_state = 1006)\n",
    "        word_cloud.generate_from_frequencies(dictionary)\n",
    "        image_colors = ImageColorGenerator(mask)\n",
    "        plt.figure(figsize = [8,8])\n",
    "        plt.imshow(word_cloud.recolor(color_func = image_colors), interpolation = \"bilinear\")\n",
    "        plt.title(title)\n",
    "        sns.set_context(\"poster\")\n",
    "        plt.axis(\"off\")\n",
    "        return plt\n",
    "    \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def bar_plot(df, title): \n",
    "    graph = sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Word\")\n",
    "    plt.ylabel(\"Count\")\n",
    "    sns.set_context(\"talk\")\n",
    "    plt.xticks(rotation = 90)\n",
    "    return plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>yeah first_person_pronoun want to address the ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>umm pamela can pronoun hear first_person_prono...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>its on september th kayla and david first_pers...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>hi ladies first_person_pronoun wanted to tell ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>lord forgive pronoun pronoun dont know what pr...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>561</td>\n",
       "      <td>i pray that first_person_pronoun family will r...</td>\n",
       "      <td>unknown</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>562</td>\n",
       "      <td>when asked if pronoun had a last statement pro...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>563</td>\n",
       "      <td>what is about to transpire in a few moments is...</td>\n",
       "      <td>no</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>564</td>\n",
       "      <td>none</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>565</td>\n",
       "      <td>statement to the media first_person_pronoun at...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>566 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                     0   labels\n",
       "0    yeah first_person_pronoun want to address the ...      yes\n",
       "1    umm pamela can pronoun hear first_person_prono...      yes\n",
       "2    its on september th kayla and david first_pers...      yes\n",
       "3    hi ladies first_person_pronoun wanted to tell ...      yes\n",
       "4    lord forgive pronoun pronoun dont know what pr...      yes\n",
       "..                                                 ...      ...\n",
       "561  i pray that first_person_pronoun family will r...  unknown\n",
       "562  when asked if pronoun had a last statement pro...      yes\n",
       "563  what is about to transpire in a few moments is...       no\n",
       "564                                               none      yes\n",
       "565  statement to the media first_person_pronoun at...      yes\n",
       "\n",
       "[566 rows x 2 columns]"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "\n",
    "# train=pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "# y=train['Sentiment'].values\n",
    "# X=train['Phrase'].values\n",
    "\n",
    "# all_df = pd.DataFrame(X)\n",
    "# all_df['labels'] = y\n",
    "# all_df\n",
    "\n",
    "df = pd.read_csv('../death_row_discritized.csv')\n",
    "\n",
    "def to_string(tokens):\n",
    "    try:\n",
    "        return \" \".join(eval(tokens))\n",
    "    except:\n",
    "        return \"error\"\n",
    "    \n",
    "df['statement_string'] = df.apply(lambda x: to_string(x['last_statement']), axis=1)\n",
    "# y=df['vic_kid'].values\n",
    "y=df['prior_record'].values\n",
    "y_labels = list(set(y))\n",
    "X=df['statement_string'].values\n",
    "\n",
    "all_df = pd.DataFrame(X)\n",
    "all_df['labels'] = y\n",
    "all_df\n",
    "\n",
    "# neg_df = pd.DataFrame(neg)\n",
    "# pos_df = pd.DataFrame(pos)\n",
    "\n",
    "# pos_df['PoN'] = 'P'\n",
    "# neg_df['PoN'] = 'N'\n",
    "# all_df = neg_df.append(pos_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "# neg = get_data_from_files('../NEG_JK/')\n",
    "# pos = get_data_from_files('../POS_JK/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_cornell/')\n",
    "# pos = get_data_from_files('../pos_cornell/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_hw4/')\n",
    "# pos = get_data_from_files('../pos_hw4/')\n",
    "\n",
    "# neg = get_data_from_files('../hw4_lie_false/')\n",
    "# pos = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# pos = get_data_from_files('../hw4_lie_false/')\n",
    "# neg = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "\n",
    "\n",
    "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
    "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)\n",
    "\n",
    "all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)\n",
    "\n",
    "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n",
    "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)\n",
    "\n",
    "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n",
    "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)\n",
    "\n",
    "all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)\n",
    "all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)\n",
    "\n",
    "all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)\n",
    "all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)\n",
    "\n",
    "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)\n",
    "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n",
    "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n",
    "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n",
    "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)\n",
    "\n",
    "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)\n",
    "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n",
    "\n",
    "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n",
    "all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "all_df['bow_nosw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n",
    "\n",
    "all_df['diy_cleaner'] = all_df.apply(lambda x: diy_cleaner(x[0]), axis=1)\n",
    "all_df['pruned'] = all_df.apply(lambda x: pruner(x['diy_cleaner']), axis=1)\n",
    "\n",
    "all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)\n",
    "all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)\n",
    "all_df['bigram_feats'] = all_df.apply(lambda x: get_bigram_feats(x['tokens']), axis=1)\n",
    "all_df['bigram_feats_neg'] = all_df.apply(lambda x: get_bigram_feats(x['nltk_negs']), axis=1)\n",
    "\n",
    "big_bow = get_bow_from_column(all_df, 'pruned')\n",
    "big_bow_1 = get_bow_from_column(all_df[all_df['labels'] == 'yes'], 'pruned')\n",
    "big_bow_2 = get_bow_from_column(all_df[all_df['labels'] == 'no'], 'pruned')\n",
    "big_bow_3 = get_bow_from_column(all_df[all_df['labels'] == 'unknown'], 'pruned')\n",
    "# big_bow_4 = get_bow_from_column(all_df[all_df['labels'] == 4], 'pruned')\n",
    "# big_bow_5 = get_bow_from_column(all_df[all_df['labels'] == 5], 'pruned')\n",
    "\n",
    "# most_common_1 = [word[0] for word in big_bow_n.most_common(100)]\n",
    "# most_common_2 = [word[0] for word in big_bow_p.most_common(100)]\n",
    "\n",
    "\n",
    "\n",
    "# all_df['no_shared_words'] = all_df.apply(lambda x: get_only_polarized(x['tokens'], get_common_words(500)), axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>yeah first_person_pronoun want to address the ...</td>\n",
       "      <td>yes</td>\n",
       "      <td>[yeah, want, to, address, the, roundtree, fami...</td>\n",
       "      <td>48</td>\n",
       "      <td>[yeah first_person_pronoun want to address the...</td>\n",
       "      <td>1</td>\n",
       "      <td>[yeah, want, address, roundtree, family, apolo...</td>\n",
       "      <td>26</td>\n",
       "      <td>[(and, 6), (to, 5), (pronoun, 4), (the, 3), (f...</td>\n",
       "      <td>[(pronoun, 4), (family, 2), (im, 2), (yeah, 1)...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.356</td>\n",
       "      <td>{'yeah': 1, 'want': 1, 'to': 5, 'address': 1, ...</td>\n",
       "      <td>{'yeah': 1, 'want': 1, 'address': 1, 'roundtre...</td>\n",
       "      <td>yeah first_person_pronoun want to address the ...</td>\n",
       "      <td>yeah first_person_pronoun want address roundtr...</td>\n",
       "      <td>[yeah, want, to, address, the, roundtree, fami...</td>\n",
       "      <td>[to, the, and, and_NEG, family, pronoun, to_NE...</td>\n",
       "      <td>[yeah_want, want_to, to_address, address_the, ...</td>\n",
       "      <td>[yeah_want, want_to, to_address, address_the, ...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>umm pamela can pronoun hear first_person_prono...</td>\n",
       "      <td>yes</td>\n",
       "      <td>[umm, pamela, can, pronoun, hear, stephanie, h...</td>\n",
       "      <td>52</td>\n",
       "      <td>[umm pamela can pronoun hear first_person_pron...</td>\n",
       "      <td>1</td>\n",
       "      <td>[umm, pamela, pronoun, hear, stephanie, hardy,...</td>\n",
       "      <td>34</td>\n",
       "      <td>[(pronoun, 4), (and, 4), (yall, 4), (tell, 3),...</td>\n",
       "      <td>[(pronoun, 4), (yall, 4), (tell, 3), (love, 2)...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.272</td>\n",
       "      <td>{'umm': 1, 'pamela': 1, 'can': 1, 'pronoun': 4...</td>\n",
       "      <td>{'umm': 1, 'pamela': 1, 'pronoun': 4, 'hear': ...</td>\n",
       "      <td>umm pamela can pronoun hear first_person_prono...</td>\n",
       "      <td>pamela pronoun hear first_person_pronoun steph...</td>\n",
       "      <td>[umm, pamela, can, pronoun, hear, stephanie, h...</td>\n",
       "      <td>[pronoun, and, yall, tell, the, love, am, for,...</td>\n",
       "      <td>[umm_pamela, pamela_can, can_pronoun, pronoun_...</td>\n",
       "      <td>[umm_pamela, pamela_can, can_pronoun, pronoun_...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>its on september th kayla and david first_pers...</td>\n",
       "      <td>yes</td>\n",
       "      <td>[its, on, september, th, kayla, and, david, wa...</td>\n",
       "      <td>110</td>\n",
       "      <td>[its on september th kayla and david first_per...</td>\n",
       "      <td>1</td>\n",
       "      <td>[september, th, kayla, david, wanted, apologiz...</td>\n",
       "      <td>60</td>\n",
       "      <td>[(to, 6), (the, 5), (yall, 5), (pain, 4), (cau...</td>\n",
       "      <td>[(yall, 5), (pain, 4), (caused, 4), (apologize...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.199</td>\n",
       "      <td>{'its': 1, 'on': 1, 'september': 1, 'th': 1, '...</td>\n",
       "      <td>{'september': 1, 'th': 1, 'kayla': 1, 'david':...</td>\n",
       "      <td>its on september th kayla and david first_pers...</td>\n",
       "      <td>september kayla david first_person_pronoun wan...</td>\n",
       "      <td>[its, on, september, th, kayla, and, david, wa...</td>\n",
       "      <td>[yall_NEG, to, the_NEG, pain_NEG, caused_NEG, ...</td>\n",
       "      <td>[its_on, on_september, september_th, th_kayla,...</td>\n",
       "      <td>[its_on, on_september, september_th, th_kayla,...</td>\n",
       "      <td>yes</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0 labels  \\\n",
       "0  yeah first_person_pronoun want to address the ...    yes   \n",
       "1  umm pamela can pronoun hear first_person_prono...    yes   \n",
       "2  its on september th kayla and david first_pers...    yes   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0  [yeah, want, to, address, the, roundtree, fami...          48   \n",
       "1  [umm, pamela, can, pronoun, hear, stephanie, h...          52   \n",
       "2  [its, on, september, th, kayla, and, david, wa...         110   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0  [yeah first_person_pronoun want to address the...              1   \n",
       "1  [umm pamela can pronoun hear first_person_pron...              1   \n",
       "2  [its on september th kayla and david first_per...              1   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0  [yeah, want, address, roundtree, family, apolo...         26   \n",
       "1  [umm, pamela, pronoun, hear, stephanie, hardy,...         34   \n",
       "2  [september, th, kayla, david, wanted, apologiz...         60   \n",
       "\n",
       "                                      topwords_unfil  \\\n",
       "0  [(and, 6), (to, 5), (pronoun, 4), (the, 3), (f...   \n",
       "1  [(pronoun, 4), (and, 4), (yall, 4), (tell, 3),...   \n",
       "2  [(to, 6), (the, 5), (yall, 5), (pain, 4), (cau...   \n",
       "\n",
       "                                        topwords_fil  ... v_pos_fd  \\\n",
       "0  [(pronoun, 4), (family, 2), (im, 2), (yeah, 1)...  ...    0.356   \n",
       "1  [(pronoun, 4), (yall, 4), (tell, 3), (love, 2)...  ...    0.272   \n",
       "2  [(yall, 5), (pain, 4), (caused, 4), (apologize...  ...    0.199   \n",
       "\n",
       "                                                 bow  \\\n",
       "0  {'yeah': 1, 'want': 1, 'to': 5, 'address': 1, ...   \n",
       "1  {'umm': 1, 'pamela': 1, 'can': 1, 'pronoun': 4...   \n",
       "2  {'its': 1, 'on': 1, 'september': 1, 'th': 1, '...   \n",
       "\n",
       "                                            bow_nosw  \\\n",
       "0  {'yeah': 1, 'want': 1, 'address': 1, 'roundtre...   \n",
       "1  {'umm': 1, 'pamela': 1, 'pronoun': 4, 'hear': ...   \n",
       "2  {'september': 1, 'th': 1, 'kayla': 1, 'david':...   \n",
       "\n",
       "                                         diy_cleaner  \\\n",
       "0  yeah first_person_pronoun want to address the ...   \n",
       "1  umm pamela can pronoun hear first_person_prono...   \n",
       "2  its on september th kayla and david first_pers...   \n",
       "\n",
       "                                              pruned  \\\n",
       "0  yeah first_person_pronoun want address roundtr...   \n",
       "1  pamela pronoun hear first_person_pronoun steph...   \n",
       "2  september kayla david first_person_pronoun wan...   \n",
       "\n",
       "                                           nltk_negs  \\\n",
       "0  [yeah, want, to, address, the, roundtree, fami...   \n",
       "1  [umm, pamela, can, pronoun, hear, stephanie, h...   \n",
       "2  [its, on, september, th, kayla, and, david, wa...   \n",
       "\n",
       "                                       unigram_feats  \\\n",
       "0  [to, the, and, and_NEG, family, pronoun, to_NE...   \n",
       "1  [pronoun, and, yall, tell, the, love, am, for,...   \n",
       "2  [yall_NEG, to, the_NEG, pain_NEG, caused_NEG, ...   \n",
       "\n",
       "                                        bigram_feats  \\\n",
       "0  [yeah_want, want_to, to_address, address_the, ...   \n",
       "1  [umm_pamela, pamela_can, can_pronoun, pronoun_...   \n",
       "2  [its_on, on_september, september_th, th_kayla,...   \n",
       "\n",
       "                                    bigram_feats_neg  PoN  \n",
       "0  [yeah_want, want_to, to_address, address_the, ...  yes  \n",
       "1  [umm_pamela, pamela_can, can_pronoun, pronoun_...  yes  \n",
       "2  [its_on, on_september, september_th, th_kayla,...  yes  \n",
       "\n",
       "[3 rows x 40 columns]"
      ]
     },
     "execution_count": 31,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Copy 'labels' into 'PoN', the target column the classifier cells below pass to get_NB.\n",
    "all_df['PoN'] = all_df['labels']\n",
    "all_df[:3]\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SAVE TO CSV!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_df.to_csv('hw7_data_sentiment.csv',index=False)\n",
    "# all_df['PoN'] = all_df['labels']\n",
    "# all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores -- Gaussian\n",
      "Accuracy: 0.5058823529411764\n",
      "Accuracy: 0.5117647058823529\n",
      "Accuracy: 0.49411764705882355\n",
      "Accuracy: 0.5176470588235295\n",
      "Accuracy: 0.5176470588235295\n",
      "AVERAGE ACCURACY: 0.5094117647058823\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Gaussian Naive Bayes on the four 'v_*' score columns (compound, pos, neg, neu).\n",
    "# DataFrame.filter keeps only the listed columns and silently skips any that are missing.\n",
    "small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores -- Gaussian')\n",
    "# display_NB_tables(tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Positive Vader Scores -- Multinomial\n",
      "Accuracy: 0.5470588235294118\n",
      "Accuracy: 0.5764705882352941\n",
      "Accuracy: 0.5058823529411764\n",
      "Accuracy: 0.5176470588235295\n",
      "Accuracy: 0.5352941176470588\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "AVERAGE ACCURACY: 0.536470588235294\n"
     ]
    }
   ],
   "source": [
    "# Multinomial Naive Bayes on 'v_pos' and 'v_neu' only.\n",
    "# NOTE(review): presumably the other score columns are left out because MultinomialNB\n",
    "# rejects negative feature values -- confirm against how the v_* columns are computed.\n",
    "small_df = all_df.filter(['v_pos','v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], MultinomialNB(), 'Positive Vader Scores -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores from Summary -- Gaussian\n",
      "Accuracy: 0.06470588235294118\n",
      "Accuracy: 0.06470588235294118\n",
      "Accuracy: 0.10588235294117647\n",
      "Accuracy: 0.06470588235294118\n",
      "Accuracy: 0.08823529411764706\n",
      "AVERAGE ACCURACY: 0.07764705882352942\n"
     ]
    }
   ],
   "source": [
    "# Gaussian Naive Bayes on the four '*_sum' score columns (per the title, scores computed from the summary).\n",
    "# NOTE(review): the saved run averages ~0.08 accuracy here versus ~0.51 for the 'Vader Scores -- Gaussian'\n",
    "# cell; check the *_sum columns for missing or misaligned values before trusting this result.\n",
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) \n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores from Summary -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores (original) and Vader Scores (summary) -- Gaussian\n",
      "Accuracy: 0.07058823529411765\n",
      "Accuracy: 0.07058823529411765\n",
      "Accuracy: 0.08823529411764706\n",
      "Accuracy: 0.10588235294117647\n",
      "Accuracy: 0.1\n",
      "AVERAGE ACCURACY: 0.08705882352941177\n"
     ]
    }
   ],
   "source": [
    "# Gaussian Naive Bayes on the summary ('*_sum') and original 'v_*' score columns together.\n",
    "# NOTE(review): the saved run averages ~0.09 accuracy, far below the original scores alone (~0.51);\n",
    "# the '*_sum' columns look like the cause -- confirm.\n",
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n",
    "                          'v_compound','v_pos', 'v_neg', 'v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores (original) and Vader Scores (summary) -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores 50 most frequent filtered words -- Gaussian\n",
      "Accuracy: 0.47058823529411764\n",
      "Accuracy: 0.45294117647058824\n",
      "Accuracy: 0.4588235294117647\n",
      "Accuracy: 0.49411764705882355\n",
      "Accuracy: 0.5235294117647059\n",
      "AVERAGE ACCURACY: 0.48\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Gaussian Naive Bayes on the four '*_fd' score columns (per the title, scores for the\n",
    "# 50 most frequent filtered words).\n",
    "small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores 50 most frequent filtered words -- Gaussian')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of Words & Machine Learning "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Gaussian\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.3588235294117647\n",
      "Accuracy: 0.4176470588235294\n",
      "Accuracy: 0.4294117647058823\n",
      "AVERAGE ACCURACY: 0.38\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v1: token counts per document, taken from the 'tokens' column.\n",
    "all_df['bow_v1'] = all_df.apply(lambda row: Counter(row['tokens']), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v1'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Gaussian\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.3588235294117647\n",
      "Accuracy: 0.4176470588235294\n",
      "Accuracy: 0.4294117647058823\n",
      "AVERAGE ACCURACY: 0.38\n"
     ]
    }
   ],
   "source": [
    "# Same call as the last line of the preceding cell (the saved outputs are identical).\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Multinomial\n",
      "Accuracy: 0.4647058823529412\n",
      "Accuracy: 0.5\n",
      "Accuracy: 0.48823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5117647058823529\n",
      "Accuracy: 0.5058823529411764\n",
      "AVERAGE ACCURACY: 0.49411764705882355\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v1: token counts per document, taken from the 'tokens' column.\n",
    "all_df['bow_v1'] = all_df.apply(lambda row: Counter(row['tokens']), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v1'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Starting point -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Bernoulli\n",
      "Accuracy: 0.48823529411764705\n",
      "Accuracy: 0.49411764705882355\n",
      "Accuracy: 0.5470588235294118\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.4588235294117647\n",
      "Accuracy: 0.5117647058823529\n",
      "AVERAGE ACCURACY: 0.5\n"
     ]
    }
   ],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Starting point -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DIY Cleaner -- Gaussian\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.3588235294117647\n",
      "Accuracy: 0.4176470588235294\n",
      "Accuracy: 0.4294117647058823\n",
      "AVERAGE ACCURACY: 0.38\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v2: counts over the casual_tokenize tokens of the 'diy_cleaner' text.\n",
    "all_df['bow_v2'] = all_df.apply(lambda row: Counter(casual_tokenize(row['diy_cleaner'])), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v2'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'DIY Cleaner -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DIY Cleaner -- Multinomial\n",
      "Accuracy: 0.4647058823529412\n",
      "Accuracy: 0.4823529411764706\n",
      "Accuracy: 0.47058823529411764\n",
      "Accuracy: 0.48823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.48823529411764705\n",
      "AVERAGE ACCURACY: 0.4788235294117647\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v2: counts over the casual_tokenize tokens of the 'diy_cleaner' text.\n",
    "all_df['bow_v2'] = all_df.apply(lambda row: Counter(casual_tokenize(row['diy_cleaner'])), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v2'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'DIY Cleaner -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "DIY Cleaner -- Bernoulli\n",
      "Accuracy: 0.48823529411764705\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5\n",
      "Accuracy: 0.5470588235294118\n",
      "Accuracy: 0.4588235294117647\n",
      "Accuracy: 0.5117647058823529\n",
      "AVERAGE ACCURACY: 0.5011764705882353\n"
     ]
    }
   ],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'DIY Cleaner -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pruned Words -- Gaussian\n",
      "Accuracy: 0.37058823529411766\n",
      "Accuracy: 0.34705882352941175\n",
      "Accuracy: 0.36470588235294116\n",
      "Accuracy: 0.40588235294117647\n",
      "Accuracy: 0.43529411764705883\n",
      "AVERAGE ACCURACY: 0.3847058823529412\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v3: counts over the casual_tokenize tokens of the 'pruned' text.\n",
    "all_df['bow_v3'] = all_df.apply(lambda row: Counter(casual_tokenize(row['pruned'])), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v3'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Pruned Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pruned Words -- Multinomial\n",
      "Accuracy: 0.4823529411764706\n",
      "Accuracy: 0.49411764705882355\n",
      "Accuracy: 0.47058823529411764\n",
      "Accuracy: 0.5\n",
      "Accuracy: 0.4588235294117647\n",
      "AVERAGE ACCURACY: 0.48117647058823537\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v3: counts over the casual_tokenize tokens of the 'pruned' text.\n",
    "all_df['bow_v3'] = all_df.apply(lambda row: Counter(casual_tokenize(row['pruned'])), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v3'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Pruned Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Pruned Words -- Bernoulli\n",
      "Accuracy: 0.5\n",
      "Accuracy: 0.5117647058823529\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5411764705882353\n",
      "Accuracy: 0.45294117647058824\n",
      "Accuracy: 0.5235294117647059\n",
      "AVERAGE ACCURACY: 0.5058823529411764\n"
     ]
    }
   ],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Pruned Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLTK negs -- Gaussian\n",
      "Accuracy: 0.32941176470588235\n",
      "Accuracy: 0.36470588235294116\n",
      "Accuracy: 0.4\n",
      "Accuracy: 0.43529411764705883\n",
      "Accuracy: 0.4588235294117647\n",
      "AVERAGE ACCURACY: 0.3976470588235294\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v4: the 'nltk_negs' token lists are joined with spaces, re-tokenized\n",
    "# with casual_tokenize, and counted per document.\n",
    "all_df['bow_v4'] = all_df.apply(lambda row: Counter(casual_tokenize(' '.join(row['nltk_negs']))), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v4'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'NLTK negs -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 49,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLTK negs -- Multinomial\n",
      "Accuracy: 0.32941176470588235\n",
      "Accuracy: 0.36470588235294116\n",
      "Accuracy: 0.4\n",
      "Accuracy: 0.43529411764705883\n",
      "Accuracy: 0.4588235294117647\n",
      "AVERAGE ACCURACY: 0.3976470588235294\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v4: the 'nltk_negs' token lists are joined with spaces, re-tokenized\n",
    "# with casual_tokenize, and counted per document.\n",
    "all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = pd.DataFrame(all_df['bow_v4'].tolist(), index=all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "\n",
    "# Fit the multinomial model the title names. Passing GaussianNB() here would only\n",
    "# repeat the 'NLTK negs -- Gaussian' run under a Multinomial label.\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'NLTK negs -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NLTK negs -- Bernoulli\n",
      "Accuracy: 0.5176470588235295\n",
      "Accuracy: 0.5235294117647059\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5823529411764706\n",
      "Accuracy: 0.4647058823529412\n",
      "Accuracy: 0.5\n",
      "AVERAGE ACCURACY: 0.5176470588235295\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    }
   ],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'NLTK negs -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bigram Feats -- Gaussian\n",
      "Accuracy: 0.40588235294117647\n",
      "Accuracy: 0.4176470588235294\n",
      "Accuracy: 0.4117647058823529\n",
      "Accuracy: 0.40588235294117647\n",
      "Accuracy: 0.4647058823529412\n",
      "AVERAGE ACCURACY: 0.4211764705882353\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v5: the 'bigram_feats' lists are joined with spaces, re-tokenized\n",
    "# with casual_tokenize, and counted per document.\n",
    "all_df['bow_v5'] = all_df.apply(lambda row: Counter(casual_tokenize(' '.join(row['bigram_feats']))), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v5'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Bigram Feats -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bigram Feats -- Multinomial\n",
      "Accuracy: 0.5117647058823529\n",
      "Accuracy: 0.5470588235294118\n",
      "Accuracy: 0.45294117647058824\n",
      "Accuracy: 0.5\n",
      "Accuracy: 0.4823529411764706\n",
      "AVERAGE ACCURACY: 0.4988235294117647\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v5: the 'bigram_feats' lists are joined with spaces, re-tokenized\n",
    "# with casual_tokenize, and counted per document.\n",
    "all_df['bow_v5'] = all_df.apply(lambda row: Counter(casual_tokenize(' '.join(row['bigram_feats']))), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v5'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Bigram Feats -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Bigram Feats -- Bernoulli\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5470588235294118\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5588235294117647\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5294117647058824\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.48823529411764705\n",
      "Accuracy: 0.5235294117647059\n",
      "AVERAGE ACCURACY: 0.5294117647058824\n"
     ]
    }
   ],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Bigram Feats -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)\n",
    "# new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])\n",
    "# new_df = new_df.fillna(0).astype(int)\n",
    "# new_df[:5]\n",
    "# tables = get_NB(new_df, new_df.index, GaussianNB(), 'No Shared Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v6: the 'no_shared_words' lists are joined with spaces, re-tokenized\n",
    "# with casual_tokenize, and counted per document.\n",
    "all_df['bow_v6'] = all_df.apply(lambda row: Counter(casual_tokenize(' '.join(row['no_shared_words']))), axis=1)\n",
    "\n",
    "# One row per document (indexed by its PoN label), one column per token;\n",
    "# tokens a document does not contain become 0 instead of NaN.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v6'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'No Shared Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse counts to 0/1 (word absent / present) before fitting BernoulliNB.\n",
    "# Reuses the new_df built by the preceding cell, so that cell must run first.\n",
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'No Shared Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word cloud titled 'Top Words' built from big_bow.\n",
    "# NOTE(review): create_word_cloud_with_mask and big_bow are defined outside this cell; presumably\n",
    "# 'yellow_square.png' is the mask image and 750 a size or word limit -- confirm against the helper.\n",
    "create_word_cloud_with_mask('yellow_square.png', big_bow, 750, \"Top Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word cloud titled 'Top Negative Words' built from big_bow_n.\n",
    "# NOTE(review): create_word_cloud_with_mask and big_bow_n are defined outside this cell; presumably\n",
    "# 'red_square.png' is the mask image and 750 a size or word limit -- confirm against the helper.\n",
    "create_word_cloud_with_mask('red_square.png', big_bow_n, 750, \"Top Negative Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Word cloud titled 'Top Positive Words' built from big_bow_p.\n",
    "# NOTE(review): create_word_cloud_with_mask and big_bow_p are defined outside this cell; presumably\n",
    "# 'green_square.png' is the mask image and 750 a size or word limit -- confirm against the helper.\n",
    "create_word_cloud_with_mask('green_square.png', big_bow_p, 750, \"Top Positive Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "def runPipeline(classifier, boolean, cv, X, y):\n",
    "    \"\"\"Cross-validate a CountVectorizer + classifier pipeline and print its mean score.\n",
    "\n",
    "    classifier -- estimator used as the final ('nb') pipeline step\n",
    "    boolean    -- forwarded to CountVectorizer(binary=...)\n",
    "    cv         -- forwarded to cross_val_score(cv=...)\n",
    "    X, y       -- documents and their labels\n",
    "\n",
    "    Prints one line with the mean score, the first letter of `boolean`,\n",
    "    `cv`, and the classifier's class name; returns nothing.\n",
    "    \"\"\"\n",
    "    steps = [\n",
    "        ('vect', CountVectorizer(encoding='latin-1', binary=boolean)),\n",
    "        ('nb', classifier),\n",
    "    ]\n",
    "    fold_scores = cross_val_score(Pipeline(steps), X, y, cv=cv)\n",
    "    mean_score = sum(fold_scores)/len(fold_scores)\n",
    "    classifier_name = str(classifier).split('(')[0]\n",
    "    print(\"{} | B? {} | CV: {} | Classifier: {}\".format(mean_score, str(boolean)[0], cv, classifier_name))\n",
    "\n",
    "# X = array of data\n",
    "# y = array of labels\n",
    "hw6 = all_df[[0,'PoN']]\n",
    "X = hw6[0].tolist()\n",
    "y = hw6['PoN'].tolist()\n",
    "\n",
    "# (classifier class, binary flag, cv) for each run, in the order they are reported.\n",
    "for nb_class, use_binary, n_folds in [\n",
    "    (BernoulliNB, False, 5),\n",
    "    (BernoulliNB, False, 3),\n",
    "    (MultinomialNB, False, 5),\n",
    "    (MultinomialNB, False, 3),\n",
    "    (MultinomialNB, True, 5),\n",
    "    (MultinomialNB, True, 3),\n",
    "]:\n",
    "    runPipeline(nb_class(), use_binary, n_folds, X=X, y=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tabulate import tabulate\n",
    "\n",
    "df = hw6\n",
    "def shorten(long_string):\n",
    "    \"\"\"Return at most the first 20 characters of long_string.\"\"\"\n",
    "    # Slicing leaves strings of 20 characters or fewer intact, so short values\n",
    "    # are shown whole instead of being cut down to their first character.\n",
    "    return long_string[:20]\n",
    "\n",
    "def df_for_tabulate(df, column):\n",
    "    \"\"\"Return a copy of df with the values in `column` truncated by shorten().\"\"\"\n",
    "    pretty_df = df.copy()\n",
    "    pretty_df[column] = pretty_df.apply(lambda x: shorten(x[column]), axis = 1)\n",
    "    return pretty_df\n",
    "    \n",
    "# Print the first 10 rows as a plain-text table, with column 0 truncated for width.\n",
    "tabulate_df = df_for_tabulate(df, 0)\n",
    "print(tabulate(tabulate_df[:10], tablefmt=\"simple\", headers=tabulate_df.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
