{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW4 PIPELINE + HW6 + HW7 + HW8 (Topic Modeling)\n",
    "## Building off HW2 + HW3"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTING\n",
    "## =======================================================\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file)\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "## =======================================================\n",
    "## TOKENIZING\n",
    "## =======================================================\n",
    "from nltk.tokenize import word_tokenize, sent_tokenize\n",
    "def get_tokens(sentence):\n",
    "    tokens = word_tokenize(sentence)\n",
    "    clean_tokens = [word.lower() for word in tokens if word.isalpha()]\n",
    "    return clean_tokens\n",
    "\n",
    "def get_sentence_tokens(review):\n",
    "    return sent_tokenize(review)\n",
    "\n",
    "## =======================================================\n",
    "## REMOVING STOPWORDS\n",
    "## =======================================================\n",
    "from nltk.corpus import stopwords\n",
    "stop_words = set(stopwords.words(\"english\"))\n",
    "def remove_stopwords(sentence):\n",
    "    filtered_text = []\n",
    "    for word in sentence:\n",
    "        if word not in stop_words:\n",
    "            filtered_text.append(word)\n",
    "    return filtered_text\n",
    "\n",
    "## =======================================================\n",
    "## FREQUENCY DISTRIBUTIONS\n",
    "## =======================================================\n",
    "from nltk.probability import FreqDist\n",
    "def get_most_common(tokens):\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist.most_common(12)\n",
    "\n",
    "def get_most_common(tokens):\n",
    "    fdist = FreqDist(tokens)\n",
    "    return fdist.most_common(12)\n",
    "\n",
    "def get_fdist(tokens):\n",
    "    return (FreqDist(tokens))\n",
    "\n",
    "## =======================================================\n",
    "## SENTIMENT ANALYSIS\n",
    "## =======================================================\n",
    "from nltk.sentiment import SentimentAnalyzer\n",
    "from nltk.sentiment.util import *\n",
    "from nltk.sentiment.vader import SentimentIntensityAnalyzer\n",
    "sid = SentimentIntensityAnalyzer()\n",
    "\n",
    "def get_vader_score(review):\n",
    "    return sid.polarity_scores(review)\n",
    "\n",
    "def separate_vader_score(vader_score, key):\n",
    "    return vader_score[key]\n",
    "\n",
    "## =======================================================\n",
    "## SUMMARIZER\n",
    "## =======================================================\n",
    "def get_weighted_freq_dist(review, freq_dist):\n",
    "    try:\n",
    "        max_freq = max(freq_dist.values())\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/max_freq)\n",
    "        return freq_dist\n",
    "    except:\n",
    "        for word in freq_dist.keys():\n",
    "            freq_dist[word] = (freq_dist[word]/1)\n",
    "        return freq_dist\n",
    "        \n",
    "\n",
    "def get_sentence_score(review, freq_dist):\n",
    "    sentence_scores = {}\n",
    "    for sent in review:\n",
    "        for word in nltk.word_tokenize(sent.lower()):\n",
    "            if word in freq_dist.keys():\n",
    "                if len(sent.split(' ')) < 30:\n",
    "                    if sent not in sentence_scores.keys():\n",
    "                        sentence_scores[sent] = freq_dist[word]\n",
    "                    else:\n",
    "                        sentence_scores[sent] += freq_dist[word]\n",
    "    return sentence_scores\n",
    "\n",
    "def get_summary_sentences(sentence_scores):\n",
    "    sorted_sentences = sorted(sentence_scores.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ''.join(sent[0] for sent in sorted_sentences[:5])\n",
    "\n",
    "def get_freq_words(freq_dist):\n",
    "    sorted_words = sorted(freq_dist.items(), key=lambda kv: kv[1], reverse=True)\n",
    "    return ' '.join(word[0] for word in sorted_words[:50])\n",
    "\n",
    "## =======================================================\n",
    "## MACHINE LEARNING -- NAIVE BAYES\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.naive_bayes import GaussianNB, MultinomialNB\n",
    "from sklearn import metrics\n",
    "from sklearn.metrics import confusion_matrix, classification_report\n",
    "\n",
    "# def get_NB(small_df, labels):\n",
    "#     x_train, x_test, y_train, y_test = train_test_split(small_df.values, labels, test_size=0.3, random_state = 109)\n",
    "\n",
    "#     gnb = GaussianNB()\n",
    "#     gnb.fit(x_train, y_train)\n",
    "#     y_pred = gnb.predict(x_test)\n",
    "#     print(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))\n",
    "    \n",
    "    \n",
    "def get_NB(small_df, labels, classifier, title):\n",
    "    seeds = [109, 210, 420, 19, 7]\n",
    "    dfs = []\n",
    "    overall = []\n",
    "    print(title)\n",
    "    for seed in seeds:\n",
    "        x_train, x_test, y_train, y_test = train_test_split(small_df.values, \n",
    "                                                            labels, test_size=0.3, random_state = seed)\n",
    "        gnb = classifier\n",
    "        gnb.fit(x_train, y_train).score(x_train, y_train)\n",
    "        y_pred = gnb.predict(x_test)\n",
    "        accuracy =  metrics.accuracy_score(y_test, y_pred)\n",
    "        report = metrics.classification_report(y_test, y_pred)\n",
    "        print(\"Accuracy:\", accuracy)\n",
    "#         print(report)\n",
    "        overall.append(accuracy)\n",
    "        cm = confusion_matrix(y_test, y_pred)\n",
    "        # confusion_matrix_graph(cm, accuracy, \"NB Multinomial Tokenized\")\n",
    "#         t0, fp, fn, tp = cm.ravel()\n",
    "        print(cm.ravel())\n",
    "        df = pd.DataFrame(cm.ravel())\n",
    "        dfs.append(df)\n",
    "    print('AVERAGE ACCURACY:', sum(overall)/len(overall))\n",
    "    return dfs\n",
    "\n",
    "\n",
    "def display_NB_tables(dfs):\n",
    "    for df in dfs:\n",
    "        print(display(df))\n",
    "        \n",
    "## =======================================================\n",
    "## PLOTS\n",
    "## =======================================================        \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def bar_plot(df, title): \n",
    "    graph = sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Word\")\n",
    "    plt.ylabel(\"Count\")\n",
    "    sns.set_context(\"talk\")\n",
    "    plt.xticks(rotation = 90)\n",
    "    return plt\n",
    "\n",
    "from nltk.tokenize import casual_tokenize\n",
    "from collections import Counter\n",
    "\n",
    "\n",
    "## =======================================================\n",
    "## CLEANERS\n",
    "## =======================================================   \n",
    "import re, string\n",
    "def diy_cleaner(review):\n",
    "    try:\n",
    "        both = review.split('\\n')\n",
    "        title = both[0]\n",
    "        review = both[1]\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    except:\n",
    "        review = review.replace(\"'\",\"\")\n",
    "    pattern = re.compile('[\\W_]+')\n",
    "    review = pattern.sub(' ', review)\n",
    "    cleaned = title + ' ' + title + ' ' + review\n",
    "    return cleaned.lower()\n",
    "\n",
    "def pruner(review):\n",
    "    clean_review = ' '.join([word for word in review.split() if len(word) > 3])\n",
    "    return clean_review\n",
    "\n",
    "sentim_analyzer = SentimentAnalyzer()\n",
    "def get_nltk_negs(tokens):\n",
    "    all_words_neg = sentim_analyzer.all_words([mark_negation(tokens)])\n",
    "    return all_words_neg\n",
    "\n",
    "def get_unigram_feats(neg_tokens):\n",
    "    unigram_feats = sentim_analyzer.unigram_word_feats(neg_tokens)\n",
    "    return unigram_feats\n",
    "    \n",
    "def get_bigram_feats(tokens):\n",
    "    ngrams = zip(*[tokens[i:] for i in range(2)])\n",
    "    return [\"_\".join(ngram) for ngram in ngrams]\n",
    "\n",
    "## =======================================================\n",
    "## HELPERS\n",
    "## =======================================================  \n",
    "def get_bow_from_column(df, column):\n",
    "    all_column_data = ' '.join(df[column].tolist())\n",
    "    all_column_fd = Counter(all_column_data.split())\n",
    "    return all_column_fd\n",
    "\n",
    "def get_common_words(num):\n",
    "    most_common_neg = [word[0] for word in big_bow_n.most_common(num)]\n",
    "    most_common_pos = [word[0] for word in big_bow_p.most_common(num)]\n",
    "    in_both = np.intersect1d(most_common_neg, most_common_pos)\n",
    "    neg_notpos = np.setdiff1d(most_common_neg, most_common_pos)\n",
    "    pos_notneg = np.setdiff1d(most_common_pos, most_common_neg)\n",
    "    return [len(in_both), len(neg_notpos), len(pos_notneg), len(in_both)/num, in_both, neg_notpos, pos_notneg]\n",
    "\n",
    "def get_only_polarized(tokens, common_words):\n",
    "    return [token for token in tokens if token not in common_words[4]] # 70\n",
    "\n",
    "## =======================================================\n",
    "## VISUALS\n",
    "## =======================================================  \n",
    "# import wordcloud\n",
    "# from wordcloud import WordCloud, ImageColorGenerator\n",
    "# from PIL import Image\n",
    "# import seaborn as sns\n",
    "# import matplotlib.pyplot as plt \n",
    "# def create_word_cloud_with_mask(path_of_mask_image, dictionary, \n",
    "#                                 max_num_words, title):\n",
    "#         mask = np.array(Image.open(path_of_mask_image))\n",
    "#         word_cloud = WordCloud(background_color = \"white\", \n",
    "#                                max_words = max_num_words, \n",
    "#                                mask = mask, max_font_size = 125, \n",
    "#                                random_state = 1006)\n",
    "#         word_cloud.generate_from_frequencies(dictionary)\n",
    "#         image_colors = ImageColorGenerator(mask)\n",
    "#         plt.figure(figsize = [8,8])\n",
    "#         plt.imshow(word_cloud.recolor(color_func = image_colors), interpolation = \"bilinear\")\n",
    "#         plt.title(title)\n",
    "#         sns.set_context(\"poster\")\n",
    "#         plt.axis(\"off\")\n",
    "#         return plt\n",
    "    \n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt \n",
    "def bar_plot(df, title): \n",
    "    graph = sns.barplot(y = \"count\", x = \"word\", data = df, palette = \"husl\")\n",
    "    plt.title(title)\n",
    "    plt.xlabel(\"Word\")\n",
    "    plt.ylabel(\"Count\")\n",
    "    sns.set_context(\"talk\")\n",
    "    plt.xticks(rotation = 90)\n",
    "    return plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156055</td>\n",
       "      <td>Hearst 's</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156058</td>\n",
       "      <td>avuncular</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156059</td>\n",
       "      <td>chortles</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>156060 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0  labels\n",
       "0       A series of escapades demonstrating the adage ...       1\n",
       "1       A series of escapades demonstrating the adage ...       2\n",
       "2                                                A series       2\n",
       "3                                                       A       2\n",
       "4                                                  series       2\n",
       "...                                                   ...     ...\n",
       "156055                                          Hearst 's       2\n",
       "156056                          forced avuncular chortles       1\n",
       "156057                                 avuncular chortles       3\n",
       "156058                                          avuncular       2\n",
       "156059                                           chortles       2\n",
       "\n",
       "[156060 rows x 2 columns]"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "#########################\n",
    "## KAGGLE SENTIMENT\n",
    "#########################\n",
    "\n",
    "train=pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values\n",
    "\n",
    "all_df = pd.DataFrame(X)\n",
    "all_df['labels'] = y\n",
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# neg = get_data_from_files('../NEG_JK/')\n",
    "# pos = get_data_from_files('../POS_JK/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_cornell/')\n",
    "# pos = get_data_from_files('../pos_cornell/')\n",
    "\n",
    "# neg = get_data_from_files('../neg_hw4/')\n",
    "# pos = get_data_from_files('../pos_hw4/')\n",
    "\n",
    "# neg = get_data_from_files('../hw4_lie_false/')\n",
    "# pos = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# pos = get_data_from_files('../hw4_lie_false/')\n",
    "# neg = get_data_from_files('../hw4_lie_true/')\n",
    "\n",
    "# neg_df = pd.DataFrame(neg)\n",
    "# pos_df = pd.DataFrame(pos)\n",
    "\n",
    "# pos_df['PoN'] = 'P'\n",
    "# neg_df['PoN'] = 'N'\n",
    "# all_df = neg_df.append(pos_df)\n",
    "\n",
    "all_df['tokens'] = all_df.apply(lambda x: get_tokens(x[0]), axis=1)\n",
    "all_df['num_tokens'] = all_df.apply(lambda x: len(x['tokens']), axis=1)\n",
    "\n",
    "all_df = all_df.drop(all_df[all_df.num_tokens < 1].index)\n",
    "\n",
    "all_df['sentences'] = all_df.apply(lambda x: get_sentence_tokens(x[0]), axis=1)\n",
    "all_df['num_sentences'] = all_df.apply(lambda x: len(x['sentences']), axis=1)\n",
    "\n",
    "all_df['no_sw'] = all_df.apply(lambda x: remove_stopwords(x['tokens']),axis=1)\n",
    "all_df['num_no_sw'] = all_df.apply(lambda x: len(x['no_sw']),axis=1)\n",
    "\n",
    "all_df['topwords_unfil'] = all_df.apply(lambda x: get_most_common(x['tokens']),axis=1)\n",
    "all_df['topwords_fil'] = all_df.apply(lambda x: get_most_common(x['no_sw']),axis=1)\n",
    "\n",
    "all_df['freq_dist'] = all_df.apply(lambda x: get_fdist(x['no_sw']),axis=1)\n",
    "all_df['freq_dist_unfil'] = all_df.apply(lambda x: get_fdist(x['tokens']),axis=1)\n",
    "\n",
    "all_df['vader_all'] = all_df.apply(lambda x: get_vader_score(x[0]),axis=1)\n",
    "all_df['v_compound'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'compound'),axis=1)\n",
    "all_df['v_neg'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neg'),axis=1)\n",
    "all_df['v_neu'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'neu'),axis=1)\n",
    "all_df['v_pos'] = all_df.apply(lambda x: separate_vader_score(x['vader_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['weighted_freq_dist'] = all_df.apply(lambda x: get_weighted_freq_dist(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['sentence_scores'] = all_df.apply(lambda x: get_sentence_score(x['sentences'], x['freq_dist']),axis=1)\n",
    "all_df['summary_sentences'] = all_df.apply(lambda x: get_summary_sentences(x['sentence_scores']), axis=1)\n",
    "\n",
    "all_df['vader_sum_all'] = all_df.apply(lambda x: get_vader_score(x['summary_sentences']),axis=1)\n",
    "all_df['v_compound_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_sum'] = all_df.apply(lambda x: separate_vader_score(x['vader_sum_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['v_freq_words'] = all_df.apply(lambda x: get_freq_words(x['freq_dist']), axis=1)\n",
    "\n",
    "all_df['vader_fq_all'] = all_df.apply(lambda x: get_vader_score(x['v_freq_words']),axis=1)\n",
    "all_df['v_compound_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'compound'),axis=1)\n",
    "all_df['v_neg_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neg'),axis=1)\n",
    "all_df['v_neu_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'neu'),axis=1)\n",
    "all_df['v_pos_fd'] = all_df.apply(lambda x: separate_vader_score(x['vader_fq_all'], 'pos'),axis=1)\n",
    "\n",
    "all_df['bow'] = all_df.apply(lambda x: Counter(x['tokens']), axis=1)\n",
    "all_df['bow_nosw'] = all_df.apply(lambda x: Counter(x['no_sw']), axis=1)\n",
    "\n",
    "all_df['diy_cleaner'] = all_df.apply(lambda x: diy_cleaner(x[0]), axis=1)\n",
    "all_df['pruned'] = all_df.apply(lambda x: pruner(x['diy_cleaner']), axis=1)\n",
    "\n",
    "all_df['nltk_negs'] = all_df.apply(lambda x: get_nltk_negs(x['tokens']), axis=1)\n",
    "all_df['unigram_feats'] = all_df.apply(lambda x: get_unigram_feats(x['nltk_negs']), axis=1)\n",
    "all_df['bigram_feats'] = all_df.apply(lambda x: get_bigram_feats(x['tokens']), axis=1)\n",
    "all_df['bigram_feats_neg'] = all_df.apply(lambda x: get_bigram_feats(x['nltk_negs']), axis=1)\n",
    "\n",
    "big_bow = get_bow_from_column(all_df, 'pruned')\n",
    "big_bow_1 = get_bow_from_column(all_df[all_df['labels'] == 1], 'pruned')\n",
    "big_bow_2 = get_bow_from_column(all_df[all_df['labels'] == 2], 'pruned')\n",
    "big_bow_3 = get_bow_from_column(all_df[all_df['labels'] == 3], 'pruned')\n",
    "big_bow_4 = get_bow_from_column(all_df[all_df['labels'] == 4], 'pruned')\n",
    "big_bow_5 = get_bow_from_column(all_df[all_df['labels'] == 5], 'pruned')\n",
    "\n",
    "# most_common_1 = [word[0] for word in big_bow_n.most_common(100)]\n",
    "# most_common_2 = [word[0] for word in big_bow_p.most_common(100)]\n",
    "\n",
    "\n",
    "\n",
    "# all_df['no_shared_words'] = all_df.apply(lambda x: get_only_polarized(x['tokens'], get_common_words(500)), axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_neu_fd</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>35</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>15</td>\n",
       "      <td>[(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...</td>\n",
       "      <td>[(good, 2), (series, 1), (escapades, 1), (demo...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.693</td>\n",
       "      <td>0.307</td>\n",
       "      <td>{'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, of, is, good, for, of_NEG, a, series, es...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>14</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>6</td>\n",
       "      <td>[(the, 2), (a, 1), (series, 1), (of, 1), (esca...</td>\n",
       "      <td>[(series, 1), (escapades, 1), (demonstrating, ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.633</td>\n",
       "      <td>0.367</td>\n",
       "      <td>{'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, a, series, of, escapades, demonstrating,...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>2</td>\n",
       "      <td>[A series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(a, 1), (series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>1.000</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1, 'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>a series a series a series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>[a_series]</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3 rows × 39 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0  labels  \\\n",
       "0  A series of escapades demonstrating the adage ...       1   \n",
       "1  A series of escapades demonstrating the adage ...       2   \n",
       "2                                           A series       2   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0  [a, series, of, escapades, demonstrating, the,...          35   \n",
       "1  [a, series, of, escapades, demonstrating, the,...          14   \n",
       "2                                        [a, series]           2   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0  [A series of escapades demonstrating the adage...              1   \n",
       "1  [A series of escapades demonstrating the adage...              1   \n",
       "2                                         [A series]              1   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0  [series, escapades, demonstrating, adage, good...         15   \n",
       "1  [series, escapades, demonstrating, adage, good...          6   \n",
       "2                                           [series]          1   \n",
       "\n",
       "                                      topwords_unfil  \\\n",
       "0  [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...   \n",
       "1  [(the, 2), (a, 1), (series, 1), (of, 1), (esca...   \n",
       "2                              [(a, 1), (series, 1)]   \n",
       "\n",
       "                                        topwords_fil  ... v_neu_fd v_pos_fd  \\\n",
       "0  [(good, 2), (series, 1), (escapades, 1), (demo...  ...    0.693    0.307   \n",
       "1  [(series, 1), (escapades, 1), (demonstrating, ...  ...    0.633    0.367   \n",
       "2                                      [(series, 1)]  ...    1.000    0.000   \n",
       "\n",
       "                                                 bow  \\\n",
       "0  {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...   \n",
       "1  {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...   \n",
       "2                              {'a': 1, 'series': 1}   \n",
       "\n",
       "                                            bow_nosw  \\\n",
       "0  {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "1  {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "2                                      {'series': 1}   \n",
       "\n",
       "                                         diy_cleaner  \\\n",
       "0  a series of escapades demonstrating the adage ...   \n",
       "1  a series of escapades demonstrating the adage ...   \n",
       "2                         a series a series a series   \n",
       "\n",
       "                                              pruned  \\\n",
       "0  series escapades demonstrating adage that what...   \n",
       "1  series escapades demonstrating adage that what...   \n",
       "2                               series series series   \n",
       "\n",
       "                                           nltk_negs  \\\n",
       "0  [a, series, of, escapades, demonstrating, the,...   \n",
       "1  [a, series, of, escapades, demonstrating, the,...   \n",
       "2                                        [a, series]   \n",
       "\n",
       "                                       unigram_feats  \\\n",
       "0  [the, of, is, good, for, of_NEG, a, series, es...   \n",
       "1  [the, a, series, of, escapades, demonstrating,...   \n",
       "2                                        [a, series]   \n",
       "\n",
       "                                        bigram_feats  \\\n",
       "0  [a_series, series_of, of_escapades, escapades_...   \n",
       "1  [a_series, series_of, of_escapades, escapades_...   \n",
       "2                                         [a_series]   \n",
       "\n",
       "                                    bigram_feats_neg  \n",
       "0  [a_series, series_of, of_escapades, escapades_...  \n",
       "1  [a_series, series_of, of_escapades, escapades_...  \n",
       "2                                         [a_series]  \n",
       "\n",
       "[3 rows x 39 columns]"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df[:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SAVE TO CSV!"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "      <th>PoN</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>35</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>15</td>\n",
       "      <td>[(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...</td>\n",
       "      <td>[(good, 2), (series, 1), (escapades, 1), (demo...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.307</td>\n",
       "      <td>{'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, of, is, good, for, of_NEG, a, series, es...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>14</td>\n",
       "      <td>[A series of escapades demonstrating the adage...</td>\n",
       "      <td>1</td>\n",
       "      <td>[series, escapades, demonstrating, adage, good...</td>\n",
       "      <td>6</td>\n",
       "      <td>[(the, 2), (a, 1), (series, 1), (of, 1), (esca...</td>\n",
       "      <td>[(series, 1), (escapades, 1), (demonstrating, ...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.367</td>\n",
       "      <td>{'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...</td>\n",
       "      <td>{'series': 1, 'escapades': 1, 'demonstrating':...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>[a, series, of, escapades, demonstrating, the,...</td>\n",
       "      <td>[the, a, series, of, escapades, demonstrating,...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>[a_series, series_of, of_escapades, escapades_...</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>2</td>\n",
       "      <td>[A series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(a, 1), (series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1, 'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>a series a series a series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a, series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>[a_series]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "      <td>[a]</td>\n",
       "      <td>1</td>\n",
       "      <td>[A]</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[(a, 1)]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'a': 1}</td>\n",
       "      <td>{}</td>\n",
       "      <td>a a a</td>\n",
       "      <td></td>\n",
       "      <td>[a]</td>\n",
       "      <td>[a]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[series]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>[(series, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>{'series': 1}</td>\n",
       "      <td>series series series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>[series]</td>\n",
       "      <td>[series]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156055</td>\n",
       "      <td>Hearst 's</td>\n",
       "      <td>2</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>1</td>\n",
       "      <td>[Hearst 's]</td>\n",
       "      <td>1</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(hearst, 1)]</td>\n",
       "      <td>[(hearst, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'hearst': 1}</td>\n",
       "      <td>{'hearst': 1}</td>\n",
       "      <td>hearst 's hearst 's hearst s</td>\n",
       "      <td>hearst hearst hearst</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>[hearst]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>1</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>3</td>\n",
       "      <td>[forced avuncular chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>3</td>\n",
       "      <td>[(forced, 1), (avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>[(forced, 1), (avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'forced': 1, 'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>{'forced': 1, 'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>forced avuncular chortles forced avuncular cho...</td>\n",
       "      <td>forced avuncular chortles forced avuncular cho...</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>[forced, avuncular, chortles]</td>\n",
       "      <td>[forced_avuncular, avuncular_chortles]</td>\n",
       "      <td>[forced_avuncular, avuncular_chortles]</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>3</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>2</td>\n",
       "      <td>[avuncular chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>2</td>\n",
       "      <td>[(avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>[(avuncular, 1), (chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>{'avuncular': 1, 'chortles': 1}</td>\n",
       "      <td>avuncular chortles avuncular chortles avuncula...</td>\n",
       "      <td>avuncular chortles avuncular chortles avuncula...</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>[avuncular, chortles]</td>\n",
       "      <td>[avuncular_chortles]</td>\n",
       "      <td>[avuncular_chortles]</td>\n",
       "      <td>3</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156058</td>\n",
       "      <td>avuncular</td>\n",
       "      <td>2</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(avuncular, 1)]</td>\n",
       "      <td>[(avuncular, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'avuncular': 1}</td>\n",
       "      <td>{'avuncular': 1}</td>\n",
       "      <td>avuncular avuncular avuncular</td>\n",
       "      <td>avuncular avuncular avuncular</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>[avuncular]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156059</td>\n",
       "      <td>chortles</td>\n",
       "      <td>2</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>1</td>\n",
       "      <td>[(chortles, 1)]</td>\n",
       "      <td>[(chortles, 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>{'chortles': 1}</td>\n",
       "      <td>{'chortles': 1}</td>\n",
       "      <td>chortles chortles chortles</td>\n",
       "      <td>chortles chortles chortles</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>[chortles]</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>2</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>154050 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                        0  labels  \\\n",
       "0       A series of escapades demonstrating the adage ...       1   \n",
       "1       A series of escapades demonstrating the adage ...       2   \n",
       "2                                                A series       2   \n",
       "3                                                       A       2   \n",
       "4                                                  series       2   \n",
       "...                                                   ...     ...   \n",
       "156055                                          Hearst 's       2   \n",
       "156056                          forced avuncular chortles       1   \n",
       "156057                                 avuncular chortles       3   \n",
       "156058                                          avuncular       2   \n",
       "156059                                           chortles       2   \n",
       "\n",
       "                                                   tokens  num_tokens  \\\n",
       "0       [a, series, of, escapades, demonstrating, the,...          35   \n",
       "1       [a, series, of, escapades, demonstrating, the,...          14   \n",
       "2                                             [a, series]           2   \n",
       "3                                                     [a]           1   \n",
       "4                                                [series]           1   \n",
       "...                                                   ...         ...   \n",
       "156055                                           [hearst]           1   \n",
       "156056                      [forced, avuncular, chortles]           3   \n",
       "156057                              [avuncular, chortles]           2   \n",
       "156058                                        [avuncular]           1   \n",
       "156059                                         [chortles]           1   \n",
       "\n",
       "                                                sentences  num_sentences  \\\n",
       "0       [A series of escapades demonstrating the adage...              1   \n",
       "1       [A series of escapades demonstrating the adage...              1   \n",
       "2                                              [A series]              1   \n",
       "3                                                     [A]              1   \n",
       "4                                                [series]              1   \n",
       "...                                                   ...            ...   \n",
       "156055                                        [Hearst 's]              1   \n",
       "156056                        [forced avuncular chortles]              1   \n",
       "156057                               [avuncular chortles]              1   \n",
       "156058                                        [avuncular]              1   \n",
       "156059                                         [chortles]              1   \n",
       "\n",
       "                                                    no_sw  num_no_sw  \\\n",
       "0       [series, escapades, demonstrating, adage, good...         15   \n",
       "1       [series, escapades, demonstrating, adage, good...          6   \n",
       "2                                                [series]          1   \n",
       "3                                                      []          0   \n",
       "4                                                [series]          1   \n",
       "...                                                   ...        ...   \n",
       "156055                                           [hearst]          1   \n",
       "156056                      [forced, avuncular, chortles]          3   \n",
       "156057                              [avuncular, chortles]          2   \n",
       "156058                                        [avuncular]          1   \n",
       "156059                                         [chortles]          1   \n",
       "\n",
       "                                           topwords_unfil  \\\n",
       "0       [(of, 4), (the, 3), (a, 2), (is, 2), (good, 2)...   \n",
       "1       [(the, 2), (a, 1), (series, 1), (of, 1), (esca...   \n",
       "2                                   [(a, 1), (series, 1)]   \n",
       "3                                                [(a, 1)]   \n",
       "4                                           [(series, 1)]   \n",
       "...                                                   ...   \n",
       "156055                                      [(hearst, 1)]   \n",
       "156056       [(forced, 1), (avuncular, 1), (chortles, 1)]   \n",
       "156057                    [(avuncular, 1), (chortles, 1)]   \n",
       "156058                                   [(avuncular, 1)]   \n",
       "156059                                    [(chortles, 1)]   \n",
       "\n",
       "                                             topwords_fil  ... v_pos_fd  \\\n",
       "0       [(good, 2), (series, 1), (escapades, 1), (demo...  ...    0.307   \n",
       "1       [(series, 1), (escapades, 1), (demonstrating, ...  ...    0.367   \n",
       "2                                           [(series, 1)]  ...    0.000   \n",
       "3                                                      []  ...    0.000   \n",
       "4                                           [(series, 1)]  ...    0.000   \n",
       "...                                                   ...  ...      ...   \n",
       "156055                                      [(hearst, 1)]  ...    0.000   \n",
       "156056       [(forced, 1), (avuncular, 1), (chortles, 1)]  ...    0.000   \n",
       "156057                    [(avuncular, 1), (chortles, 1)]  ...    0.000   \n",
       "156058                                   [(avuncular, 1)]  ...    0.000   \n",
       "156059                                    [(chortles, 1)]  ...    0.000   \n",
       "\n",
       "                                                      bow  \\\n",
       "0       {'a': 2, 'series': 1, 'of': 4, 'escapades': 1,...   \n",
       "1       {'a': 1, 'series': 1, 'of': 1, 'escapades': 1,...   \n",
       "2                                   {'a': 1, 'series': 1}   \n",
       "3                                                {'a': 1}   \n",
       "4                                           {'series': 1}   \n",
       "...                                                   ...   \n",
       "156055                                      {'hearst': 1}   \n",
       "156056       {'forced': 1, 'avuncular': 1, 'chortles': 1}   \n",
       "156057                    {'avuncular': 1, 'chortles': 1}   \n",
       "156058                                   {'avuncular': 1}   \n",
       "156059                                    {'chortles': 1}   \n",
       "\n",
       "                                                 bow_nosw  \\\n",
       "0       {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "1       {'series': 1, 'escapades': 1, 'demonstrating':...   \n",
       "2                                           {'series': 1}   \n",
       "3                                                      {}   \n",
       "4                                           {'series': 1}   \n",
       "...                                                   ...   \n",
       "156055                                      {'hearst': 1}   \n",
       "156056       {'forced': 1, 'avuncular': 1, 'chortles': 1}   \n",
       "156057                    {'avuncular': 1, 'chortles': 1}   \n",
       "156058                                   {'avuncular': 1}   \n",
       "156059                                    {'chortles': 1}   \n",
       "\n",
       "                                              diy_cleaner  \\\n",
       "0       a series of escapades demonstrating the adage ...   \n",
       "1       a series of escapades demonstrating the adage ...   \n",
       "2                              a series a series a series   \n",
       "3                                                   a a a   \n",
       "4                                    series series series   \n",
       "...                                                   ...   \n",
       "156055                       hearst 's hearst 's hearst s   \n",
       "156056  forced avuncular chortles forced avuncular cho...   \n",
       "156057  avuncular chortles avuncular chortles avuncula...   \n",
       "156058                      avuncular avuncular avuncular   \n",
       "156059                         chortles chortles chortles   \n",
       "\n",
       "                                                   pruned  \\\n",
       "0       series escapades demonstrating adage that what...   \n",
       "1       series escapades demonstrating adage that what...   \n",
       "2                                    series series series   \n",
       "3                                                           \n",
       "4                                    series series series   \n",
       "...                                                   ...   \n",
       "156055                               hearst hearst hearst   \n",
       "156056  forced avuncular chortles forced avuncular cho...   \n",
       "156057  avuncular chortles avuncular chortles avuncula...   \n",
       "156058                      avuncular avuncular avuncular   \n",
       "156059                         chortles chortles chortles   \n",
       "\n",
       "                                                nltk_negs  \\\n",
       "0       [a, series, of, escapades, demonstrating, the,...   \n",
       "1       [a, series, of, escapades, demonstrating, the,...   \n",
       "2                                             [a, series]   \n",
       "3                                                     [a]   \n",
       "4                                                [series]   \n",
       "...                                                   ...   \n",
       "156055                                           [hearst]   \n",
       "156056                      [forced, avuncular, chortles]   \n",
       "156057                              [avuncular, chortles]   \n",
       "156058                                        [avuncular]   \n",
       "156059                                         [chortles]   \n",
       "\n",
       "                                            unigram_feats  \\\n",
       "0       [the, of, is, good, for, of_NEG, a, series, es...   \n",
       "1       [the, a, series, of, escapades, demonstrating,...   \n",
       "2                                             [a, series]   \n",
       "3                                                     [a]   \n",
       "4                                                [series]   \n",
       "...                                                   ...   \n",
       "156055                                           [hearst]   \n",
       "156056                      [forced, avuncular, chortles]   \n",
       "156057                              [avuncular, chortles]   \n",
       "156058                                        [avuncular]   \n",
       "156059                                         [chortles]   \n",
       "\n",
       "                                             bigram_feats  \\\n",
       "0       [a_series, series_of, of_escapades, escapades_...   \n",
       "1       [a_series, series_of, of_escapades, escapades_...   \n",
       "2                                              [a_series]   \n",
       "3                                                      []   \n",
       "4                                                      []   \n",
       "...                                                   ...   \n",
       "156055                                                 []   \n",
       "156056             [forced_avuncular, avuncular_chortles]   \n",
       "156057                               [avuncular_chortles]   \n",
       "156058                                                 []   \n",
       "156059                                                 []   \n",
       "\n",
       "                                         bigram_feats_neg PoN  \n",
       "0       [a_series, series_of, of_escapades, escapades_...   1  \n",
       "1       [a_series, series_of, of_escapades, escapades_...   2  \n",
       "2                                              [a_series]   2  \n",
       "3                                                      []   2  \n",
       "4                                                      []   2  \n",
       "...                                                   ...  ..  \n",
       "156055                                                 []   2  \n",
       "156056             [forced_avuncular, avuncular_chortles]   1  \n",
       "156057                               [avuncular_chortles]   3  \n",
       "156058                                                 []   2  \n",
       "156059                                                 []   2  \n",
       "\n",
       "[154050 rows x 40 columns]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "all_df.to_csv('hw7_data_sentiment_v2.csv',index=False)\n",
    "all_df['PoN'] = all_df['labels']\n",
    "all_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores -- Gaussian\n",
      "Accuracy: 0.5212593313859136\n",
      "[  154   764  1031   212    19   252  1893  4803   937   123   234  1958\n",
      " 18145  2414   782    45   445  4544  3060  1692     2    73   793  1002\n",
      "   838]\n",
      "Accuracy: 0.5159796602834578\n",
      "[  196   649  1029   204    25   382  1799  4858   990    91   394  1848\n",
      " 17786  2783   582    56   467  4522  3305  1391     5    75   797  1221\n",
      "   760]\n",
      "Accuracy: 0.5136427566807313\n",
      "[  211   693  1032   193    32   444  1778  4807   925   123   410  1802\n",
      " 17971  2467   809    49   415  4636  2976  1668     6    77   820  1069\n",
      "   802]\n",
      "Accuracy: 0.515070864437953\n",
      "[  174   701  1037   184    32   319  1826  4896   936   134   333  1949\n",
      " 17924  2570   706    58   434  4647  3069  1578     4    79   802  1012\n",
      "   811]\n",
      "Accuracy: 0.514226982581413\n",
      "[  164   707  1001   192    19   284  1897  4922   929   124   300  1897\n",
      " 17822  2514   775    59   437  4595  3037  1699     3    77   831  1085\n",
      "   845]\n",
      "AVERAGE ACCURACY: 0.5160359190738937\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound','v_pos', 'v_neg', 'v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores -- Gaussian')\n",
    "# display_NB_tables(tables)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Positive Vader Scores -- Multinomial\n",
      "Accuracy: 0.5118684409823651\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[    0     0  2180     0     0     0     0  8004     4     0     0     0\n",
      " 23351   182     0     0     0  9481   305     0     0     0  2588   120\n",
      "     0]\n",
      "Accuracy: 0.5092935194201017\n",
      "[    0     0  2102     1     0     0     0  8113     7     0     0     0\n",
      " 23180   213     0     0     0  9384   357     0     0     0  2691   167\n",
      "     0]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5111976630963972\n",
      "[    0     0  2159     2     0     0     0  8069     8     0     0     0\n",
      " 23252   207     0     0     0  9371   373     0     0     0  2623   151\n",
      "     0]\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/sklearn/metrics/classification.py:1437: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples.\n",
      "  'precision', 'predicted', average, warn_for)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.5115005950448989\n",
      "[    0     0  2127     1     0     0     0  8104     7     0     0     0\n",
      " 23272   210     0     0     0  9419   367     0     0     0  2539   169\n",
      "     0]\n",
      "Accuracy: 0.5078004976739154\n",
      "[    0     0  2083     0     0     0     0  8149     7     0     0     0\n",
      " 23108   200     0     0     0  9467   360     0     0     0  2715   126\n",
      "     0]\n",
      "AVERAGE ACCURACY: 0.5103321432435356\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_pos','v_neu'])\n",
    "tables = get_NB(small_df, all_df['PoN'], MultinomialNB(), 'Positive Vader Scores -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores from Summary -- Gaussian\n",
      "Accuracy: 0.5090987774532079\n",
      "[  211   661   978   316    14   467  1603  4723  1099   116   459  1685\n",
      " 17672  2919   798    49   404  4436  3246  1651     3    67   751  1091\n",
      "   796]\n",
      "Accuracy: 0.5043600562587904\n",
      "[  225   581   983   291    23   517  1592  4780  1147    84   582  1605\n",
      " 17309  3305   592    60   430  4428  3461  1362     4    70   754  1308\n",
      "   722]\n",
      "Accuracy: 0.5017202207075625\n",
      "[  231   643   973   284    30   570  1555  4740  1100   112   579  1564\n",
      " 17490  3010   816    55   374  4535  3146  1634     4    70   777  1158\n",
      "   765]\n",
      "Accuracy: 0.5042085902845397\n",
      "[  209   627   985   283    24   467  1612  4801  1108   123   497  1713\n",
      " 17466  3041   765    64   389  4529  3245  1559     2    70   763  1103\n",
      "   770]\n",
      "Accuracy: 0.5032565184463919\n",
      "[  210   620   959   278    16   475  1637  4844  1085   115   481  1657\n",
      " 17355  3023   792    68   386  4473  3252  1648     4    71   795  1167\n",
      "   804]\n",
      "AVERAGE ACCURACY: 0.5045288326300985\n"
     ]
    }
   ],
   "source": [
    "small_df = all_df.filter(['v_compound_sum','v_pos_sum', 'v_neg_sum', 'v_neu_sum']) \n",
    "tables = get_NB(small_df, all_df['PoN'], GaussianNB(), 'Vader Scores from Summary -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores (original) and Vader Scores (summary) -- Gaussian\n",
      "Accuracy: 0.4752569512063183\n",
      "[  640   391   850   214    85  1510   916  4295   767   520  1550   927\n",
      " 17540  1307  2209   247   369  4122  1393  3655    27    74   676   456\n",
      "  1475]\n",
      "Accuracy: 0.4712971978794764\n",
      "[  638   308   869   185   103  1601   874  4355   818   472  1611   917\n",
      " 17214  1438  2213   279   377  4126  1500  3459    39    55   700   509\n",
      "  1555]\n",
      "Accuracy: 0.47209780374337335\n",
      "[  674   345   855   180   107  1609   891  4338   731   508  1583   918\n",
      " 17384  1295  2279   263   340  4228  1358  3555    43    65   690   465\n",
      "  1511]\n",
      "Accuracy: 0.47082116196040247\n",
      "[  652   333   869   165   109  1491   961  4370   762   527  1575  1000\n",
      " 17349  1224  2334   260   359  4232  1290  3645    34    70   686   411\n",
      "  1507]\n",
      "Accuracy: 0.4714703018500487\n",
      "[  634   339   837   177    96  1534   958  4377   791   496  1559   931\n",
      " 17214  1373  2231   278   350  4171  1453  3575    34    72   707   498\n",
      "  1530]\n",
      "AVERAGE ACCURACY: 0.4721886833279238\n"
     ]
    }
   ],
   "source": [
    "# Feature set: the '*_sum' VADER columns followed by the un-suffixed VADER columns.\n",
    "small_df = all_df.filter(items=[\n",
    "    'v_compound_sum',\n",
    "    'v_pos_sum',\n",
    "    'v_neg_sum',\n",
    "    'v_neu_sum',\n",
    "    'v_compound',\n",
    "    'v_pos',\n",
    "    'v_neg',\n",
    "    'v_neu',\n",
    "])\n",
    "tables = get_NB(\n",
    "    small_df,\n",
    "    all_df['PoN'],\n",
    "    GaussianNB(),\n",
    "    'Vader Scores (original) and Vader Scores (summary) -- Gaussian',\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vader Scores 50 most frequent filtered words -- Gaussian\n",
      "Accuracy: 0.5117602510007573\n",
      "[  213   711   952   243    61   411  1721  4525  1071   280   432  1889\n",
      " 17660  2728   824    66   497  4304  3111  1808     5    77   729   951\n",
      "   946]\n",
      "Accuracy: 0.5124093908904035\n",
      "[  221   632   969   225    56   454  1714  4513  1248   191   471  1817\n",
      " 17407  3108   590    74   492  4309  3498  1368     8    75   747  1187\n",
      "   841]\n",
      "Accuracy: 0.5063940279130152\n",
      "[  253   658   950   229    71   571  1639  4493  1083   291   529  1764\n",
      " 17521  2769   876    74   471  4386  3062  1751    13    76   751  1006\n",
      "   928]\n",
      "Accuracy: 0.5075841177107\n",
      "[  200   689   953   227    59   370  1761  4539  1136   305   437  1893\n",
      " 17559  2684   909    71   477  4404  2959  1875     7    89   738   895\n",
      "   979]\n",
      "Accuracy: 0.5061560099534783\n",
      "[  218   662   928   215    60   523  1646  4615  1095   277   509  1739\n",
      " 17437  2775   848    81   464  4367  3113  1802     8    80   766  1009\n",
      "   978]\n",
      "AVERAGE ACCURACY: 0.5088607594936708\n"
     ]
    }
   ],
   "source": [
    "# Feature set: the four '*_fd' VADER columns (run title: 50 most frequent filtered words).\n",
    "small_df = all_df.filter(items=[\n",
    "    'v_compound_fd',\n",
    "    'v_pos_fd',\n",
    "    'v_neu_fd',\n",
    "    'v_neg_fd',\n",
    "])\n",
    "tables = get_NB(\n",
    "    small_df,\n",
    "    all_df['PoN'],\n",
    "    GaussianNB(),\n",
    "    'Vader Scores 50 most frequent filtered words -- Gaussian',\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Bag of Words & Machine Learning "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Starting point -- Gaussian\n",
      "Accuracy: 0.18355512279562913\n",
      "[1714  336   44   14   72 4525 2124  196  414  749 9992 3648  806 3145\n",
      " 5942 2204  546  225 1886 4925  365   37   29  324 1953]\n",
      "Accuracy: 0.1868873742291464\n",
      "[1608  371   33   34   57 4519 2162  216  474  749 9978 3492  853 3324\n",
      " 5746 2312  521  233 1960 4715  382   20   33  369 2054]\n"
     ]
    }
   ],
   "source": [
    "# Bag of words v1: per-row Counter over the 'tokens' column.\n",
    "all_df['bow_v1'] = all_df.apply(\n",
    "    lambda row: Counter(row['tokens']),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index,\n",
    "# tokens absent from a row are filled with 0.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v1'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Starting point -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v1 (Counter over the 'tokens' column), scored with Multinomial NB.\n",
    "all_df['bow_v1'] = all_df.apply(\n",
    "    lambda row: Counter(row['tokens']),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v1'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Starting point -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Starting point -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v2: casual_tokenize the 'diy_cleaner' text, then count tokens per row.\n",
    "all_df['bow_v2'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(row['diy_cleaner'])),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v2'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'DIY Cleaner -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v2 (casual_tokenize over 'diy_cleaner'), scored with Multinomial NB.\n",
    "all_df['bow_v2'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(row['diy_cleaner'])),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v2'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'DIY Cleaner -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'DIY Cleaner -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v3: casual_tokenize the 'pruned' text, then count tokens per row.\n",
    "all_df['bow_v3'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(row['pruned'])),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v3'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Pruned Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v3 (casual_tokenize over 'pruned'), scored with Multinomial NB.\n",
    "all_df['bow_v3'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(row['pruned'])),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v3'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Pruned Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Pruned Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v4: space-join each row's 'nltk_negs' items, casual_tokenize, then count.\n",
    "all_df['bow_v4'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['nltk_negs']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v4'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'NLTK negs -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v4: space-join each row's 'nltk_negs' items, casual_tokenize, then count.\n",
    "all_df['bow_v4'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['nltk_negs']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v4'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "# This run is labelled Multinomial, so it must fit MultinomialNB; it previously passed\n",
    "# GaussianNB() and therefore just repeated the Gaussian run under the wrong name.\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'NLTK negs -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'NLTK negs -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v5: space-join each row's 'bigram_feats' items, casual_tokenize, then count.\n",
    "all_df['bow_v5'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['bigram_feats']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v5'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'Bigram Feats -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v5 (space-joined 'bigram_feats', casual_tokenize), scored with Multinomial NB.\n",
    "all_df['bow_v5'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['bigram_feats']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v5'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'Bigram Feats -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'Bigram Feats -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v6: space-join each row's 'no_shared_words' items, casual_tokenize, then count.\n",
    "all_df['bow_v6'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['no_shared_words']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v6'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, GaussianNB(), 'No Shared Words -- Gaussian')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Bag of words v6 (space-joined 'no_shared_words', casual_tokenize), scored with Multinomial NB.\n",
    "all_df['bow_v6'] = all_df.apply(\n",
    "    lambda row: Counter(casual_tokenize(' '.join(row['no_shared_words']))),\n",
    "    axis=1,\n",
    ")\n",
    "# One column per distinct token; the PoN values become the row index.\n",
    "new_df = (\n",
    "    pd.DataFrame(all_df['bow_v6'].tolist(), index=all_df['PoN'])\n",
    "    .fillna(0)\n",
    "    .astype(int)\n",
    ")\n",
    "tables = get_NB(new_df, new_df.index, MultinomialNB(), 'No Shared Words -- Multinomial')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Collapse the counts to 0/1 presence flags before fitting Bernoulli NB.\n",
    "# Uses the new_df built by an earlier cell (it is not rebuilt here).\n",
    "new_df = (new_df != 0).astype(int)\n",
    "tables = get_NB(new_df, new_df.index, BernoulliNB(), 'No Shared Words -- Bernoulli')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('yellow_square.png', big_bow, 750, \"Top Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('red_square.png', big_bow_n, 750, \"Top Negative Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "create_word_cloud_with_mask('green_square.png', big_bow_p, 750, \"Top Positive Words\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import cross_val_score\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "def runPipeline(classifier, boolean, cv, X, y):\n",
    "    \"\"\"Cross-validate a CountVectorizer -> classifier pipeline and print the mean score.\n",
    "\n",
    "    classifier: estimator used as the final ('nb') pipeline step.\n",
    "    boolean:    forwarded to CountVectorizer(binary=...).\n",
    "    cv:         forwarded to cross_val_score(cv=...).\n",
    "    X, y:       documents and labels handed to cross_val_score.\n",
    "\n",
    "    Prints one summary line and returns None.\n",
    "    \"\"\"\n",
    "    steps = [\n",
    "        ('vect', CountVectorizer(encoding='latin-1', binary=boolean)),\n",
    "        ('nb', classifier),\n",
    "    ]\n",
    "    fold_scores = cross_val_score(Pipeline(steps), X, y, cv=cv)\n",
    "    mean_score = sum(fold_scores) / len(fold_scores)\n",
    "    # Report only the first letter of the flag (T/F) and the bare classifier class name.\n",
    "    flag_initial = str(boolean)[0]\n",
    "    clf_name = str(classifier).split('(')[0]\n",
    "    print(\"{} | B? {} | CV: {} | Classifier: {}\".format(mean_score, flag_initial, cv, clf_name))\n",
    "\n",
    "# X = array of data\n",
    "# y = array of labels\n",
    "\n",
    "hw6 = all_df[[0,'PoN']]\n",
    "X = hw6[0].tolist()\n",
    "y = hw6['PoN'].tolist()\n",
    "\n",
    "runPipeline(BernoulliNB(), False, 5, X=X, y=y)\n",
    "runPipeline(BernoulliNB(), False, 3, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), False, 5, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), False, 3, X=X, y=y)\n",
    "runPipeline(MultinomialNB(), True, 5,  X=X, y=y)\n",
    "runPipeline(MultinomialNB(), True, 3,  X=X, y=y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from tabulate import tabulate\n",
    "\n",
    "df = hw6\n",
    "def shorten(long_string, max_len=20):\n",
    "    \"\"\"Return long_string cut to at most max_len characters.\n",
    "\n",
    "    Strings that already fit are returned whole. The earlier version returned only\n",
    "    the first character for any string shorter than 21 characters, which wiped out\n",
    "    short entries in the preview table.\n",
    "    \"\"\"\n",
    "    return long_string[:max_len]\n",
    "\n",
    "def df_for_tabulate(df, column):\n",
    "    \"\"\"Return a copy of df in which every value of `column` has been passed through shorten().\"\"\"\n",
    "    preview = df.copy()\n",
    "    preview[column] = preview.apply(lambda row: shorten(row[column]), axis=1)\n",
    "    return preview\n",
    "\n",
    "# Preview the first 10 rows with column 0 truncated for display.\n",
    "tabulate_df = df_for_tabulate(df, 0)\n",
    "print(tabulate(tabulate_df[:10], tablefmt=\"simple\", headers=tabulate_df.columns))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}