{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW4 PIPELINE + HW6 + HW7\n",
    "## Building off HW2 + HW3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTING\n",
    "## =======================================================\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "## =======================================================\n",
    "## TOKENIZING\n",
    "## =======================================================\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "def get_tokens(sentence):\n",
    "    \"\"\"Tokenize `sentence` with NLTK and return its purely alphabetic tokens, lowercased.\"\"\"\n",
    "    lowered = []\n",
    "    for tok in word_tokenize(sentence):\n",
    "        if tok.isalpha():\n",
    "            lowered.append(tok.lower())\n",
    "    return lowered\n",
    "\n",
    "def get_sentence_tokens(review):\n",
    "    \"\"\"Split `review` into sentences using NLTK's `sent_tokenize`.\"\"\"\n",
    "    sentences = sent_tokenize(review)\n",
    "    return sentences\n",
    "\n",
    "## =======================================================\n",
    "## REMOVING STOPWORDS\n",
    "## =======================================================\n",
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    \"\"\"Return the items of `sentence` that are not in the module-level `stop_words` set.\"\"\"\n",
    "    return [word for word in sentence if word not in stop_words]\n",
    "\n",
    "## =======================================================\n",
    "## FREQUENCY DISTRIBUTIONS\n",
    "## =======================================================\n",
    "from nltk.probability import FreqDist\n",
    "def get_most_common(tokens, n=12):\n",
    "    \"\"\"Return the `n` most frequent tokens as (token, count) pairs.\n",
    "\n",
    "    Args:\n",
    "        tokens: Iterable of hashable tokens.\n",
    "        n: Number of entries to return; defaults to 12, the value that used\n",
    "            to be hard-coded.\n",
    "    \"\"\"\n",
    "    return FreqDist(tokens).most_common(n)\n",
    "\n",
    "def get_fdist(tokens):\n",
    "    \"\"\"Build an NLTK `FreqDist` from `tokens`.\"\"\"\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist\n",
    "\n",
    "## =======================================================\n",
    "## SENTIMENT ANALYSIS\n",
    "## =======================================================\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "sid = SentimentIntensityAnalyzer()\n",
    "\n",
    "def get_vader_score(review):\n",
    "    \"\"\"Return what the module-level VADER analyzer `sid` reports from `polarity_scores(review)`.\"\"\"\n",
    "    scores = sid.polarity_scores(review)\n",
    "    return scores\n",
    "\n",
    "def separate_vader_score(vader_score, key):\n",
    "    return vader_score[key]\n",
    "\n",
    "## =======================================================\n",
    "## SUMMARIZER\n",
    "## =======================================================\n",
    "def get_weighted_freq_dist(review, freq_dist):\n",
    "    try:\n",
    "        max_freq = max(freq_dist.values())\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/max_freq)\n",
    "        return freq_dist\n",
    "    except:\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/1)\n",
    "        return freq_dist\n",
    "        \n",
    "\n",
    "def get_sentence_score(review, freq_dist):\n",
    "    \"\"\"Score each sentence by summing the weights of the words it contains.\n",
    "\n",
    "    Args:\n",
    "        review: Iterable of sentence strings.\n",
    "        freq_dist: Mapping of word -> weight.\n",
    "\n",
    "    Returns:\n",
    "        dict: sentence -> summed weight. Sentences with 30 or more\n",
    "        space-separated words, or with no word found in `freq_dist`, are omitted.\n",
    "    \"\"\"\n",
    "    sentence_scores = {}\n",
    "    for sent in review:\n",
    "        # The length filter does not depend on the word, so test it once per sentence.\n",
    "        if len(sent.split(' ')) >= 30:\n",
    "            continue\n",
    "        # Call the `word_tokenize` imported at the top of this cell. No `import nltk`\n",
    "        # appears here, so the bare name `nltk` presumably only resolved through\n",
    "        # `from nltk.sentiment.util import *`.\n",
    "        for word in word_tokenize(sent.lower()):\n",
    "            if word in freq_dist:\n",
    "                sentence_scores[sent] = sentence_scores.get(sent, 0) + freq_dist[word]\n",
    "    return sentence_scores\n",
    "\n",
    "def get_summary_sentences(sentence_scores):\n",
    "    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ''.join(sent[0] for sent in sorted_sentences[:5])\n",
    "\n",
    "def get_freq_words(freq_dist):\n",
    "    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ' '.join(word[0] for word in sorted_words[:50])\n",
    "\n",
    "## =======================================================\n",
    "## MACHINE LEARNING -- NAIVE BAYES\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "\n",
    "# def get_NB(small_df, labels):\n",
    "#     x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "#     gnb = GaussianNB()\n",
    "#     gnb.fit(x_train, y_train)\n",
    "#     y_pred = gnb.predict(x_test)\n",
    "#     print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))\n",
    "    \n",
    "    \n",
    "def get_NB(small_df, labels, classifier, title):\n",
    "    \"\"\"Fit and evaluate `classifier` on five train/test splits (test_size=0.3).\n",
    "\n",
    "    Args:\n",
    "        small_df: DataFrame of features; `small_df.values` is what gets split.\n",
    "        labels: Target labels aligned with the rows of `small_df`.\n",
    "        classifier: Estimator exposing `fit` and `predict`; it is refit for every seed.\n",
    "        title: Heading printed before the per-seed accuracies.\n",
    "\n",
    "    Returns:\n",
    "        list: One DataFrame per seed with columns 'labels' and 'nums'. For a\n",
    "        2x2 confusion matrix the rows are accuracy, true neg, false pos,\n",
    "        false neg and true pos; for any other shape, accuracy is followed by\n",
    "        one row per confusion-matrix cell.\n",
    "    \"\"\"\n",
    "    seeds = [109, 210, 420, 19, 7]\n",
    "    dfs = []\n",
    "    overall = []\n",
    "    print(title)\n",
    "    for seed in seeds:\n",
    "        x_train, x_test, y_train, y_test = train_test_split(\n",
    "            small_df.values, labels, test_size=0.3, random_state=seed)\n",
    "        classifier.fit(x_train, y_train)\n",
    "        y_pred = classifier.predict(x_test)\n",
    "        accuracy = metrics.accuracy_score(y_test, y_pred)\n",
    "        print(\"Accuracy:\", accuracy)\n",
    "        overall.append(accuracy)\n",
    "        cm = confusion_matrix(y_test, y_pred)\n",
    "        if cm.shape == (2, 2):\n",
    "            tn, fp, fn, tp = cm.ravel()\n",
    "            row_labels = ['accuracy', 'true neg', 'false pos', 'false neg', 'true pos']\n",
    "            nums = [accuracy, tn, fp, fn, tp]\n",
    "        else:\n",
    "            # tn/fp/fn/tp only exist for two classes; unpacking a larger matrix\n",
    "            # raised ValueError. Report every cell instead, named by its row\n",
    "            # (true) and column (predicted) position in the matrix.\n",
    "            size = cm.shape[0]\n",
    "            row_labels = ['accuracy'] + ['true %d / pred %d' % (i, j)\n",
    "                                         for i in range(size) for j in range(size)]\n",
    "            nums = [accuracy] + list(cm.ravel())\n",
    "        dfs.append(pd.DataFrame({'labels': row_labels, 'nums': nums}))\n",
    "    print('AVERAGE ACCURACY:', sum(overall) / len(overall))\n",
    "    return dfs\n",
    "\n",
    "\n",
    "def display_NB_tables(dfs):\n",
    "    \"\"\"Render every DataFrame in `dfs` with the notebook's rich `display`.\"\"\"\n",
    "    for df in dfs:\n",
    "        # display() returns None, so wrapping it in print() emitted a stray \"None\".\n",
    "        display(df)\n",
    "        \n",
    "## =======================================================\n",
    "## PLOTS\n",
    "## =======================================================        \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "# bar_plot() is defined in the VISUALS section further down this cell; the\n",
    "# identical copy that used to sit here was shadowed by that later definition.\n",
    "\n",
    "from nltk.tokenize import casual_tokenize\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "## =======================================================\n",
    "## CLEANERS\n",
    "## =======================================================   \n",
    "import re, string\n",
    "def diy_cleaner(review):\n",
    "    try:\n",
    "        both = review.split('\\n')\n",
    "        title = both[0]\n",
    "        review = both[1]\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    except:\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    pattern = re.compile('[\\W_]+')\n",
    "    review = pattern.sub(' ', review)\n",
    "    cleaned = title + ' ' + title + ' ' + review\n",
    "    return cleaned.lower()\n",
    "\n",
    "def pruner(review):\n",
    "    clean_review = ' '.join([word for word in review.split() if len(word) > 3])\n",
    "    return clean_review\n",
    "\n",
    "sentim_analyzer = SentimentAnalyzer()\n",
    "def get_nltk_negs(tokens):\n",
    "    \"\"\"Run `mark_negation` on `tokens` and return what `sentim_analyzer.all_words` yields for it.\"\"\"\n",
    "    negated = mark_negation(tokens)\n",
    "    return sentim_analyzer.all_words([negated])\n",
    "\n",
    "def get_unigram_feats(neg_tokens):\n",
    "    \"\"\"Return `sentim_analyzer.unigram_word_feats` computed over `neg_tokens`.\"\"\"\n",
    "    return sentim_analyzer.unigram_word_feats(neg_tokens)\n",
    "    \n",
    "def get_bigram_feats(tokens):\n",
    "    ngrams = zip(*[tokens[i:] for i in range(2)])\n",
    "    return [\"_\".join(ngram) for ngram in ngrams]\n",
    "\n",
    "## =======================================================\n",
    "## HELPERS\n",
    "## =======================================================  \n",
    "def get_bow_from_column(df, column):\n",
    "    all_column_data = ' '.join(df[column].tolist())\n",
    "    all_column_fd = Counter(all_column_data.split())\n",
    "    return all_column_fd\n",
    "\n",
    "def get_common_words(num):\n",
    "    most_common_neg = [word[0] for word in big_bow_n.most_common(num)]\n",
    "    most_common_pos = [word[0] for word in big_bow_p.most_common(num)]\n",
    "    in_both = np.intersect1d(most_common_neg, most_common_pos)\n",
    "    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)\n",
    "    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)\n",
    "    return [len(in_both), len(neg_notpos), len(pos_notneg), len(in_both)/num, in_both, neg_notpos, pos_notneg]\n",
    "\n",
    "def get_only_polarized(tokens, common_words):\n",
    "    return [token for token in tokens if token not in common_words[4]] # 70\n",
    "\n",
    "## =======================================================\n",
    "## VISUALS\n",
    "## =======================================================  \n",
    "import wordcloud\n",
    "from wordcloud import WordCloud, ImageColorGenerator\n",
    "from PIL import Image\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def create_word_cloud_with_mask(path_of_mask_image, dictionary,\n",
    "                                max_num_words, title):\n",
    "    \"\"\"Draw a word cloud that is shaped and colored by a mask image.\n",
    "\n",
    "    Args:\n",
    "        path_of_mask_image: Path of the image opened with PIL and used as the mask.\n",
    "        dictionary: Mapping handed to `WordCloud.generate_from_frequencies`.\n",
    "        max_num_words: Value passed as `max_words` to `WordCloud`.\n",
    "        title: Title placed on the figure.\n",
    "\n",
    "    Returns:\n",
    "        The `matplotlib.pyplot` module that the figure was drawn with.\n",
    "    \"\"\"\n",
    "    mask = np.array(Image.open(path_of_mask_image))\n",
    "    cloud_options = {\n",
    "        \"background_color\": \"white\",\n",
    "        \"max_words\": max_num_words,\n",
    "        \"mask\": mask,\n",
    "        \"max_font_size\": 125,\n",
    "        \"random_state\": 1006,\n",
    "    }\n",
    "    word_cloud = WordCloud(**cloud_options)\n",
    "    word_cloud.generate_from_frequencies(dictionary)\n",
    "    image_colors = ImageColorGenerator(mask)\n",
    "    plt.figure(figsize=[8, 8])\n",
    "    recolored = word_cloud.recolor(color_func=image_colors)\n",
    "    plt.imshow(recolored, interpolation=\"bilinear\")\n",
    "    plt.title(title)\n",
    "    sns.set_context(\"poster\")\n",
    "    plt.axis(\"off\")\n",
    "    return plt\n",
    "    \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def bar_plot(df, title):\n",
    "    \"\"\"Draw a bar chart of word counts.\n",
    "\n",
    "    Args:\n",
    "        df: DataFrame with a \"word\" column (x axis) and a \"count\" column (y axis).\n",
    "        title: Plot title.\n",
    "\n",
    "    Returns:\n",
    "        The `matplotlib.pyplot` module that the chart was drawn with.\n",
    "    \"\"\"\n",
    "    # Set the plotting context before drawing: it was previously applied only\n",
    "    # after the bars, title and axis labels had already been created.\n",
    "    sns.set_context(\"talk\")\n",
    "    sns.barplot(y=\"count\", x=\"word\", data=df, palette=\"husl\")\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Word\")\n",
    "    plt.ylabel(\"Count\")\n",
    "    plt.xticks(rotation=90)\n",
    "    return plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156055</td>\n",
       "      <td>Hearst 's</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156058</td>\n",
       "      <td>avuncular</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156059</td>\n",
       "      <td>chortles</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>156060 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0  labels\n",
       "0       A series of escapades demonstrating the adage ...       1\n",
       "1       A series of escapades demonstrating the adage ...       2\n",
       "2                                                A series       2\n",
       "3                                                       A       2\n",
       "4                                                  series       2\n",
       "...                                                   ...     ...\n",
       "156055                                          Hearst 's       2\n",
       "156056                          forced avuncular chortles       1\n",
       "156057                                 avuncular chortles       3\n",
       "156058                                          avuncular       2\n",
       "156059                                           chortles       2\n",
       "\n",
       "[156060 rows x 2 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "\n",
    "# Load the tab-separated training file and pull out the text and label columns.\n",
    "train = pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", sep='\\t')\n",
    "X = train['Phrase'].values\n",
    "y = train['Sentiment'].values\n",
    "\n",
    "# Column 0 holds the raw phrase text; 'labels' holds the matching sentiment value.\n",
    "all_df = pd.DataFrame(X)\n",
    "all_df['labels'] = y\n",
    "all_df\n",
    "\n",
    "\n",
    "# neg_df = pd.DataFrame(neg)\n",
    "# pos_df = pd.DataFrame(pos)\n",
    "\n",
    "# pos_df['PoN'] = 'P'\n",
    "# neg_df['PoN'] = 'N'\n",
    "# all_df = neg_df.append(pos_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# neg = get_data_from_files('../NEG_JK/')\n",
    "# pos = get_data_from_files('../POS_JK/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_cornell/')\n",
    "# pos = get_data_from_files('../pos_cornell/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_hw4/')\n",
    "# pos = get_data_from_files('../pos_hw4/')\n",
    "\n",
    "# neg = get_data_from_files('../hw4_lie_false/')\n",
    "# pos = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# pos = get_data_from_files('../hw4_lie_false/')\n",
    "# neg = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "\n",
    "\n",
    "# Tokenize the raw text (column 0) and count the tokens in each row.\n",
    "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
    "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)\n",
    "\n",
    "# Drop rows that ended up with no tokens at all.\n",
    "all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)\n",
    "\n",
    "# Sentence split and sentence count.\n",
    "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n",
    "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)\n",
    "\n",
    "# Tokens with stopwords removed, and how many remain.\n",
    "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n",
    "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)\n",
    "\n",
    "# Most common words, before and after stopword removal.\n",
    "all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)\n",
    "all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)\n",
    "\n",
    "# Frequency distributions, after and before stopword removal.\n",
    "all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)\n",
    "all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)\n",
    "\n",
    "# VADER scores of the raw text, unpacked into one column per component.\n",
    "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)\n",
    "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n",
    "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n",
    "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n",
    "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)\n",
    "\n",
    "# NOTE: get_weighted_freq_dist rescales x['freq_dist'] in place and returns that\n",
    "# same object, so 'freq_dist' and 'weighted_freq_dist' share the weighted values\n",
    "# from here on. The statements below depend on this ordering.\n",
    "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)\n",
    "\n",
    "# VADER scores of the summary sentences.\n",
    "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)\n",
    "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)\n",
    "\n",
    "# The highest-weighted words of each row, joined into one string.\n",
    "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n",
    "\n",
    "# VADER scores of that frequent-word string.\n",
    "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n",
    "all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)\n",
    "\n",
    "# Per-row bags of words, with and without stopwords.\n",
    "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "all_df['bow_nosw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n",
    "\n",
    "# Cleaned text, then the same text with short words pruned.\n",
    "all_df['diy_cleaner'] = all_df.apply(lambda x: diy_cleaner(x[0]), axis=1)\n",
    "all_df['pruned'] = all_df.apply(lambda x: pruner(x['diy_cleaner']), axis=1)\n",
    "\n",
    "# Negation-marked tokens, plus unigram and bigram features.\n",
    "all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)\n",
    "all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)\n",
    "all_df['bigram_feats'] = all_df.apply(lambda x: get_bigram_feats(x['tokens']), axis=1)\n",
    "all_df['bigram_feats_neg'] = all_df.apply(lambda x: get_bigram_feats(x['nltk_negs']), axis=1)\n",
    "\n",
    "# Bag of words over the whole 'pruned' column, then one per label value.\n",
    "# NOTE(review): assumes the labels run 1..5 -- TODO confirm against\n",
    "# train['Sentiment']; a 0-based scale would leave label 0 without a bag and\n",
    "# big_bow_5 empty.\n",
    "big_bow = get_bow_from_column(all_df, 'pruned')\n",
    "big_bow_1 = get_bow_from_column(all_df[all_df['labels'] == 1], 'pruned')\n",
    "big_bow_2 = get_bow_from_column(all_df[all_df['labels'] == 2], 'pruned')\n",
    "big_bow_3 = get_bow_from_column(all_df[all_df['labels'] == 3], 'pruned')\n",
    "big_bow_4 = get_bow_from_column(all_df[all_df['labels'] == 4], 'pruned')\n",
    "big_bow_5 = get_bow_from_column(all_df[all_df['labels'] == 5], 'pruned')\n",
    "\n",
    "# most_common_1 = [word[0] for word in big_bow_n.most_common(100)]\n",
    "# most_common_2 = [word[0] for word in big_bow_p.most_common(100)]\n",
    "\n",
    "\n",
    "\n",
    "# all_df['no_shared_words'] = all_df.apply(lambda x: get_only_polarized(x['tokens'], get_common_words(500)), axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_neu_fd</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>35</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>15</td>\n",
       "      <td>[(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...</td>\n",
       "      <td>[(good, 2), (series, 1), (escapades, 1), (demo...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.693</td>\n",
       "      <td>0.307</td>\n",
       "      <td>{'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, of, is, good, for, of_NEG, a, series, es...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>14</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>6</td>\n",
       "      <td>[(the, 2), (a, 1), (series, 1), (of, 1), (esca...</td>\n",
       "      <td>[(series, 1), (escapades, 1), (demonstrating, ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.633</td>\n",
       "      <td>0.367</td>\n",
       "      <td>{'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, a, series, of, escapades, demonstrating,...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>2</td>\n",
       "      <td>[A series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(a, 1), (series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1, 'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>a series a series a series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>[a_series]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 39 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0  labels  \\\n",
       "0  A series of escapades demonstrating the adage ...       1   \n",
       "1  A series of escapades demonstrating the adage ...       2   \n",
       "2                                           A series       2   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0  [a, series, of, escapades, demonstrating, the,...          35   \n",
       "1  [a, series, of, escapades, demonstrating, the,...          14   \n",
       "2                                        [a, series]           2   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0  [A series of escapades demonstrating the adage...              1   \n",
       "1  [A series of escapades demonstrating the adage...              1   \n",
       "2                                         [A series]              1   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0  [series, escapades, demonstrating, adage, good...         15   \n",
       "1  [series, escapades, demonstrating, adage, good...          6   \n",
       "2                                           [series]          1   \n",
       "\n",
       "                                      topwords_unfil  \\\n",
       "0  [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...   \n",
       "1  [(the, 2), (a, 1), (series, 1), (of, 1), (esca...   \n",
       "2                              [(a, 1), (series, 1)]   \n",
       "\n",
       "                                        topwords_fil  ... v_neu_fd v_pos_fd  \\\n",
       "0  [(good, 2), (series, 1), (escapades, 1), (demo...  ...    0.693    0.307   \n",
       "1  [(series, 1), (escapades, 1), (demonstrating, ...  ...    0.633    0.367   \n",
       "2                                      [(series, 1)]  ...    1.000    0.000   \n",
       "\n",
       "                                                 bow  \\\n",
       "0  {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...   \n",
       "1  {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...   \n",
       "2                              {'a': 1, 'series': 1}   \n",
       "\n",
       "                                            bow_nosw  \\\n",
       "0  {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "1  {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "2                                      {'series': 1}   \n",
       "\n",
       "                                         diy_cleaner  \\\n",
       "0  a series of escapades demonstrating the adage ...   \n",
       "1  a series of escapades demonstrating the adage ...   \n",
       "2                         a series a series a series   \n",
       "\n",
       "                                              pruned  \\\n",
       "0  series escapades demonstrating adage that what...   \n",
       "1  series escapades demonstrating adage that what...   \n",
       "2                               series series series   \n",
       "\n",
       "                                           nltk_negs  \\\n",
       "0  [a, series, of, escapades, demonstrating, the,...   \n",
       "1  [a, series, of, escapades, demonstrating, the,...   \n",
       "2                                        [a, series]   \n",
       "\n",
       "                                       unigram_feats  \\\n",
       "0  [the, of, is, good, for, of_NEG, a, series, es...   \n",
       "1  [the, a, series, of, escapades, demonstrating,...   \n",
       "2                                        [a, series]   \n",
       "\n",
       "                                        bigram_feats  \\\n",
       "0  [a_series, series_of, of_escapades, escapades_...   \n",
       "1  [a_series, series_of, of_escapades, escapades_...   \n",
       "2                                         [a_series]   \n",
       "\n",
       "                                    bigram_feats_neg  \n",
       "0  [a_series, series_of, of_escapades, escapades_...  \n",
       "1  [a_series, series_of, of_escapades, escapades_...  \n",
       "2                                         [a_series]  \n",
       "\n",
       "[3 rows x 39 columns]"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df.head(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SAVE TO CSV!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>35</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>15</td>\n",
       "      <td>[(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...</td>\n",
       "      <td>[(good, 2), (series, 1), (escapades, 1), (demo...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.307</td>\n",
       "      <td>{'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, of, is, good, for, of_NEG, a, series, es...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>14</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>6</td>\n",
       "      <td>[(the, 2), (a, 1), (series, 1), (of, 1), (esca...</td>\n",
       "      <td>[(series, 1), (escapades, 1), (demonstrating, ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.367</td>\n",
       "      <td>{'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, a, series, of, escapades, demonstrating,...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>2</td>\n",
       "      <td>[A series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(a, 1), (series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1, 'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>a series a series a series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "      <td>[a]</td>\n",
       "      <td>1</td>\n",
       "      <td>[A]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[(a, 1)]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1}</td>\n",
       "      <td>{}</td>\n",
       "      <td>a a a</td>\n",
       "      <td></td>\n",
       "      <td>[a]</td>\n",
       "      <td>[a]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>series series series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[series]</td>\n",
       "      <td>[series]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156055</td>\n",
       "      <td>Hearst 's</td>\n",
       "      <td>2</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>1</td>\n",
       "      <td>[Hearst 's]</td>\n",
       "      <td>1</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(hearst, 1)]</td>\n",
       "      <td>[(hearst, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'hearst': 1}</td>\n",
       "      <td>{'hearst': 1}</td>\n",
       "      <td>hearst 's hearst 's hearst s</td>\n",
       "      <td>hearst hearst hearst</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>1</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>3</td>\n",
       "      <td>[forced avuncular chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>3</td>\n",
       "      <td>[(forced, 1), (avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>[(forced, 1), (avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'forced': 1, 'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>{'forced': 1, 'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>forced avuncular chortles forced avuncular cho...</td>\n",
       "      <td>forced avuncular chortles forced avuncular cho...</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>[forced_avuncular, avuncular_chortles]</td>\n",
       "      <td>[forced_avuncular, avuncular_chortles]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>3</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>2</td>\n",
       "      <td>[avuncular chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>2</td>\n",
       "      <td>[(avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>[(avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>{'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>avuncular chortles avuncular chortles avuncula...</td>\n",
       "      <td>avuncular chortles avuncular chortles avuncula...</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>[avuncular_chortles]</td>\n",
       "      <td>[avuncular_chortles]</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156058</td>\n",
       "      <td>avuncular</td>\n",
       "      <td>2</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(avuncular, 1)]</td>\n",
       "      <td>[(avuncular, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'avuncular': 1}</td>\n",
       "      <td>{'avuncular': 1}</td>\n",
       "      <td>avuncular avuncular avuncular</td>\n",
       "      <td>avuncular avuncular avuncular</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156059</td>\n",
       "      <td>chortles</td>\n",
       "      <td>2</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(chortles, 1)]</td>\n",
       "      <td>[(chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'chortles': 1}</td>\n",
       "      <td>{'chortles': 1}</td>\n",
       "      <td>chortles chortles chortles</td>\n",
       "      <td>chortles chortles chortles</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>154050 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0  labels  \\\n",
       "0       A series of escapades demonstrating the adage ...       1   \n",
       "1       A series of escapades demonstrating the adage ...       2   \n",
       "2                                                A series       2   \n",
       "3                                                       A       2   \n",
       "4                                                  series       2   \n",
       "...                                                   ...     ...   \n",
       "156055                                          Hearst 's       2   \n",
       "156056                          forced avuncular chortles       1   \n",
       "156057                                 avuncular chortles       3   \n",
       "156058                                          avuncular       2   \n",
       "156059                                           chortles       2   \n",
       "\n",
       "                                                   tokens  num_tokens  \\\n",
       "0       [a, series, of, escapades, demonstrating, the,...          35   \n",
       "1       [a, series, of, escapades, demonstrating, the,...          14   \n",
       "2                                             [a, series]           2   \n",
       "3                                                     [a]           1   \n",
       "4                                                [series]           1   \n",
       "...                                                   ...         ...   \n",
       "156055                                           [hearst]           1   \n",
       "156056                      [forced, avuncular, chortles]           3   \n",
       "156057                              [avuncular, chortles]           2   \n",
       "156058                                        [avuncular]           1   \n",
       "156059                                         [chortles]           1   \n",
       "\n",
       "                                                sentences  num_sentences  \\\n",
       "0       [A series of escapades demonstrating the adage...              1   \n",
       "1       [A series of escapades demonstrating the adage...              1   \n",
       "2                                              [A series]              1   \n",
       "3                                                     [A]              1   \n",
       "4                                                [series]              1   \n",
       "...                                                   ...            ...   \n",
       "156055                                        [Hearst 's]              1   \n",
       "156056                        [forced avuncular chortles]              1   \n",
       "156057                               [avuncular chortles]              1   \n",
       "156058                                        [avuncular]              1   \n",
       "156059                                         [chortles]              1   \n",
       "\n",
       "                                                    no_sw  num_no_sw  \\\n",
       "0       [series, escapades, demonstrating, adage, good...         15   \n",
       "1       [series, escapades, demonstrating, adage, good...          6   \n",
       "2                                                [series]          1   \n",
       "3                                                      []          0   \n",
       "4                                                [series]          1   \n",
       "...                                                   ...        ...   \n",
       "156055                                           [hearst]          1   \n",
       "156056                      [forced, avuncular, chortles]          3   \n",
       "156057                              [avuncular, chortles]          2   \n",
       "156058                                        [avuncular]          1   \n",
       "156059                                         [chortles]          1   \n",
       "\n",
       "                                           topwords_unfil  \\\n",
       "0       [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...   \n",
       "1       [(the, 2), (a, 1), (series, 1), (of, 1), (esca...   \n",
       "2                                   [(a, 1), (series, 1)]   \n",
       "3                                                [(a, 1)]   \n",
       "4                                           [(series, 1)]   \n",
       "...                                                   ...   \n",
       "156055                                      [(hearst, 1)]   \n",
       "156056       [(forced, 1), (avuncular, 1), (chortles, 1)]   \n",
       "156057                    [(avuncular, 1), (chortles, 1)]   \n",
       "156058                                   [(avuncular, 1)]   \n",
       "156059                                    [(chortles, 1)]   \n",
       "\n",
       "                                             topwords_fil  ... v_pos_fd  \\\n",
       "0       [(good, 2), (series, 1), (escapades, 1), (demo...  ...    0.307   \n",
       "1       [(series, 1), (escapades, 1), (demonstrating, ...  ...    0.367   \n",
       "2                                           [(series, 1)]  ...    0.000   \n",
       "3                                                      []  ...    0.000   \n",
       "4                                           [(series, 1)]  ...    0.000   \n",
       "...                                                   ...  ...      ...   \n",
       "156055                                      [(hearst, 1)]  ...    0.000   \n",
       "156056       [(forced, 1), (avuncular, 1), (chortles, 1)]  ...    0.000   \n",
       "156057                    [(avuncular, 1), (chortles, 1)]  ...    0.000   \n",
       "156058                                   [(avuncular, 1)]  ...    0.000   \n",
       "156059                                    [(chortles, 1)]  ...    0.000   \n",
       "\n",
       "                                                      bow  \\\n",
       "0       {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...   \n",
       "1       {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...   \n",
       "2                                   {'a': 1, 'series': 1}   \n",
       "3                                                {'a': 1}   \n",
       "4                                           {'series': 1}   \n",
       "...                                                   ...   \n",
       "156055                                      {'hearst': 1}   \n",
       "156056       {'forced': 1, 'avuncular': 1, 'chortles': 1}   \n",
       "156057                    {'avuncular': 1, 'chortles': 1}   \n",
       "156058                                   {'avuncular': 1}   \n",
       "156059                                    {'chortles': 1}   \n",
       "\n",
       "                                                 bow_nosw  \\\n",
       "0       {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "1       {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "2                                           {'series': 1}   \n",
       "3                                                      {}   \n",
       "4                                           {'series': 1}   \n",
       "...                                                   ...   \n",
       "156055                                      {'hearst': 1}   \n",
       "156056       {'forced': 1, 'avuncular': 1, 'chortles': 1}   \n",
       "156057                    {'avuncular': 1, 'chortles': 1}   \n",
       "156058                                   {'avuncular': 1}   \n",
       "156059                                    {'chortles': 1}   \n",
       "\n",
       "                                              diy_cleaner  \\\n",
       "0       a series of escapades demonstrating the adage ...   \n",
       "1       a series of escapades demonstrating the adage ...   \n",
       "2                              a series a series a series   \n",
       "3                                                   a a a   \n",
       "4                                    series series series   \n",
       "...                                                   ...   \n",
       "156055                       hearst 's hearst 's hearst s   \n",
       "156056  forced avuncular chortles forced avuncular cho...   \n",
       "156057  avuncular chortles avuncular chortles avuncula...   \n",
       "156058                      avuncular avuncular avuncular   \n",
       "156059                         chortles chortles chortles   \n",
       "\n",
       "                                                   pruned  \\\n",
       "0       series escapades demonstrating adage that what...   \n",
       "1       series escapades demonstrating adage that what...   \n",
       "2                                    series series series   \n",
       "3                                                           \n",
       "4                                    series series series   \n",
       "...                                                   ...   \n",
       "156055                               hearst hearst hearst   \n",
       "156056  forced avuncular chortles forced avuncular cho...   \n",
       "156057  avuncular chortles avuncular chortles avuncula...   \n",
       "156058                      avuncular avuncular avuncular   \n",
       "156059                         chortles chortles chortles   \n",
       "\n",
       "                                                nltk_negs  \\\n",
       "0       [a, series, of, escapades, demonstrating, the,...   \n",
       "1       [a, series, of, escapades, demonstrating, the,...   \n",
       "2                                             [a, series]   \n",
       "3                                                     [a]   \n",
       "4                                                [series]   \n",
       "...                                                   ...   \n",
       "156055                                           [hearst]   \n",
       "156056                      [forced, avuncular, chortles]   \n",
       "156057                              [avuncular, chortles]   \n",
       "156058                                        [avuncular]   \n",
       "156059                                         [chortles]   \n",
       "\n",
       "                                            unigram_feats  \\\n",
       "0       [the, of, is, good, for, of_NEG, a, series, es...   \n",
       "1       [the, a, series, of, escapades, demonstrating,...   \n",
       "2                                             [a, series]   \n",
       "3                                                     [a]   \n",
       "4                                                [series]   \n",
       "...                                                   ...   \n",
       "156055                                           [hearst]   \n",
       "156056                      [forced, avuncular, chortles]   \n",
       "156057                              [avuncular, chortles]   \n",
       "156058                                        [avuncular]   \n",
       "156059                                         [chortles]   \n",
       "\n",
       "                                             bigram_feats  \\\n",
       "0       [a_series, series_of, of_escapades, escapades_...   \n",
       "1       [a_series, series_of, of_escapades, escapades_...   \n",
       "2                                              [a_series]   \n",
       "3                                                      []   \n",
       "4                                                      []   \n",
       "...                                                   ...   \n",
       "156055                                                 []   \n",
       "156056             [forced_avuncular, avuncular_chortles]   \n",
       "156057                               [avuncular_chortles]   \n",
       "156058                                                 []   \n",
       "156059                                                 []   \n",
       "\n",
       "                                         bigram_feats_neg PoN  \n",
       "0       [a_series, series_of, of_escapades, escapades_...   1  \n",
       "1       [a_series, series_of, of_escapades, escapades_...   2  \n",
       "2                                              [a_series]   2  \n",
       "3                                                      []   2  \n",
       "4                                                      []   2  \n",
       "...                                                   ...  ..  \n",
       "156055                                                 []   2  \n",
       "156056             [forced_avuncular, avuncular_chortles]   1  \n",
       "156057                               [avuncular_chortles]   3  \n",
       "156058                                                 []   2  \n",
       "156059                                                 []   2  \n",
       "\n",
       "[154050 rows x 40 columns]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Create the label column first so the saved CSV contains the same\n",
    "# columns as the in-memory frame used by the model cells below.\n",
    "all_df['PoN'] = all_df['labels']\n",
    "all_df.to_csv('hw7_data_sentiment.csv', index=False)\n",
    "\n",
    "# Show a short preview instead of rendering the whole frame.\n",
    "all_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores -- Gaussian\n",
      "Accuracy: 0.5212593313859136\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-8-23ce805ce724>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0msmall_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'v_compound'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'v_pos'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neg'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neu'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msmall_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PoN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Vader Scores -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0;31m# display_NB_tables(tables)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "# Gaussian Naive Bayes over the four VADER score columns, labelled by 'PoN'.\n",
    "# NOTE(review): the saved run of this cell stops inside get_NB at\n",
    "# `tn, fp, fn, tp = cm.ravel()` with 'too many values to unpack (expected 4)';\n",
    "# that unpacking needs a 2x2 confusion matrix -- confirm how many classes\n",
    "# 'PoN' holds.\n",
    "small_df = all_df.filter(\n",
    "    items=['v_compound', 'v_pos', 'v_neg', 'v_neu']\n",
    ")\n",
    "tables = get_NB(\n",
    "    small_df,\n",
    "    all_df['PoN'],\n",
    "    GaussianNB(),\n",
    "    'Vader Scores -- Gaussian',\n",
    ")\n",
    "# Table display is currently disabled:\n",
    "# display_NB_tables(tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Positive Vader Scores -- Multinomial\n",
      "Accuracy: 0.5118684409823651\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-9-3d098b38ddd9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0msmall_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'v_pos'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'v_neu'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msmall_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PoN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mMultinomialNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Positive Vader Scores -- Multinomial'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "# Multinomial Naive Bayes over the 'v_pos' and 'v_neu' columns, labelled by 'PoN'.\n",
    "# NOTE(review): the saved run of this cell stops inside get_NB at\n",
    "# `tn, fp, fn, tp = cm.ravel()` with 'too many values to unpack (expected 4)';\n",
    "# that unpacking needs a 2x2 confusion matrix -- confirm how many classes\n",
    "# 'PoN' holds.\n",
    "small_df = all_df.filter(\n",
    "    items=['v_pos', 'v_neu']\n",
    ")\n",
    "tables = get_NB(\n",
    "    small_df,\n",
    "    all_df['PoN'],\n",
    "    MultinomialNB(),\n",
    "    'Positive Vader Scores -- Multinomial',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores from Summary -- Gaussian\n",
      "Accuracy: 0.5090987774532079\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-10-fee9a940fcf0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0msmall_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'v_compound_sum'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'v_pos_sum'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neg_sum'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neu_sum'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msmall_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PoN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Vader Scores from Summary -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) \n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores from Summary -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores (original) and Vader Scores (summary) -- Gaussian\n",
      "Accuracy: 0.4752569512063183\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-11-b40a6387af11>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n\u001b[1;32m      2\u001b[0m                           'v_compound','v_pos', 'v_neg', 'v_neu'])\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msmall_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PoN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Vader Scores (original) and Vader Scores (summary) -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum', \n",
    "                          'v_compound','v_pos', 'v_neg', 'v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores (original) and Vader Scores (summary) -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores 50 most frequent filtered words -- Gaussian\n",
      "Accuracy: 0.5117602510007573\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-12-1aa2be201ad4>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0msmall_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfilter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'v_compound_fd'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'v_pos_fd'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neu_fd'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'v_neg_fd'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msmall_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mall_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'PoN'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Vader Scores 50 most frequent filtered words -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_fd','v_pos_fd', 'v_neu_fd', 'v_neg_fd'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores 50 most frequent filtered words -- Gaussian')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of Words & Machine Learning "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Gaussian\n",
      "Accuracy: 0.18355512279562913\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-16-44361492ad3c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      4\u001b[0m \u001b[0mnew_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfillna\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Starting point -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "all_df['bow_v1'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "all_df\n",
    "new_df = pd.DataFrame(all_df['bow_v1'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Gaussian\n",
      "Accuracy: 0.18355512279562913\n"
     ]
    },
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 4)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-17-132afb5c5ab9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtables\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_NB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnew_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnew_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mGaussianNB\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'Starting point -- Gaussian'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
      "\u001b[0;32m<ipython-input-1-53b5ce53451f>\u001b[0m in \u001b[0;36mget_NB\u001b[0;34m(small_df, labels, classifier, title)\u001b[0m\n\u001b[1;32m    135\u001b[0m         \u001b[0mcm\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mconfusion_matrix\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    136\u001b[0m         \u001b[0;31m# confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 137\u001b[0;31m         \u001b[0mtn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfn\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtp\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mcm\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mravel\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    138\u001b[0m         df = pd.DataFrame({ 'labels': ['accuracy','true neg', 'false pos', 'false neg', 'true pos'], \n\u001b[1;32m    139\u001b[0m                            'nums': [accuracy, tn, fp, fn, tp] })\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 4)"
     ]
    }
   ],
   "source": [
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v1'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v1'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Starting point -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Starting point -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v2'] = all_df.apply(lambda x: Counter(casual_tokenize(x['diy_cleaner'])), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v2'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'DIY Cleaner -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v2'] = all_df.apply(lambda x: Counter(casual_tokenize(x['diy_cleaner'])), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v2'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'DIY Cleaner -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'DIY Cleaner -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Pruned Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v3'] = all_df.apply(lambda x: Counter(casual_tokenize(x['pruned'])), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v3'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Pruned Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Pruned Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v4'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'NLTK negs -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v4'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['nltk_negs']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v4'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'NLTK negs -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'NLTK negs -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v5'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['bigram_feats']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v5'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Bigram Feats -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v5'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['bigram_feats']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v5'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Bigram Feats -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Bigram Feats -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'No Shared Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "all_df['bow_v6'] = all_df.apply(lambda x: Counter(casual_tokenize(' '.join(x['no_shared_words']))), axis=1)\n",
    "new_df = pd.DataFrame(all_df['bow_v6'].tolist(), all_df['PoN'])\n",
    "new_df = new_df.fillna(0).astype(int)\n",
    "new_df[:5]\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'No Shared Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "new_df = new_df.astype(bool).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'No Shared Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('yellow_square.png', big_bow, 750, \"Top Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('red_square.png', big_bow_n, 750, \"Top Negative Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('green_square.png', big_bow_p, 750, \"Top Positive Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "def runPipeline(classifier, boolean, cv, X, y):\n",
    "    nb_clf_pipe = Pipeline([('vect', CountVectorizer(encoding='latin-1', binary=boolean)),('nb', classifier)])\n",
    "    scores = cross_val_score(nb_clf_pipe, X, y, cv=cv)\n",
    "    avg=sum(scores)/len(scores)\n",
    "#     pretty_line = \"{} | Accuracy using {} -- and booleans? {}\"\n",
    "    pretty_line = \"{} | B? {} | CV: {} | Classifier: {}\"\n",
    "    print(pretty_line.format(avg, str(boolean)[0], cv, str(classifier).split('(')[0]))\n",
    "\n",
    "# X = array of data\n",
    "# y = array of labels\n",
    "\n",
    "hw6 = all_df[[0,'PoN']]\n",
    "X = hw6[0].tolist()\n",
    "y = hw6['PoN'].tolist()\n",
    "\n",
    "runPipeline(BernoulliNB(), False, 5, X=X, y=y)\n",
    "runPipeline(BernoulliNB(), False, 3, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), False, 5, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), False, 3, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), True, 5,  X=X, y=y)\n",
    "runPipeline(MultinomialNB(), True, 3,  X=X, y=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tabulate import tabulate\n",
    "\n",
    "df = hw6\n",
    "def shorten(long_string):\n",
    "    return long_string[:1] if len(long_string) < 21 else long_string[:20]\n",
    "\n",
    "def df_for_tabulate(df, column):\n",
    "    pretty_df = df.copy()\n",
    "    pretty_df[column] = pretty_df.apply(lambda x: shorten(x[column]), axis = 1)\n",
    "    return pretty_df\n",
    "    \n",
    "tabulate_df = df_for_tabulate(df, 0)\n",
    "print(tabulate(tabulate_df[:10], tablefmt=\"simple\", headers=tabulate_df.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
