{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW7: Comparing MNB & SVM with Kaggle Sentiment Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## OVERVIEW"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "### VECTORIZERS USED:\n",
    "    CountVectorizer\n",
    "    TfidfVectorizer\n",
    "\n",
    "### MODELS USED:\n",
    "    Multinomial Naive Bayes (MNB)\n",
    "    Support Vector Machines (SVM)\n",
    "---"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "---\n",
    "#### VECTORIZATION PARAMS:\n",
    "    Binary\n",
    "    Stopwords\n",
    "    Unigrams, Bigrams\n",
    "    Min & Max df\n",
    "---\n",
    "\n",
    "#### TODO:\n",
    "    Stemming?\n",
    "    VADER + TextBlob"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### FUNCTION & PACKAGE PARTY"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## TOKENIZING\n",
    "## =======================================================\n",
    "from nltk.corpus import stopwords\n",
    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
    "\n",
    "## =======================================================\n",
    "## PREPROCESSING\n",
    "## =======================================================\n",
    "\n",
    "# FIRST - removing anything with 3 or fewer characters\n",
    "# def my_preprocessor(doc):\n",
    "# #     print('PREPROCESSING!!!!!')\n",
    "#     if len(doc) > 3:\n",
    "#         return(doc)\n",
    "#     else:\n",
    "#         return('none')\n",
    "    \n",
    "def my_preprocessor(doc):\n",
    "#     print('PREPROCESSING!!!!!')\n",
    "    if len(doc) > 2:\n",
    "        return(doc)\n",
    "    else:\n",
    "        return('empty')\n",
    "\n",
    "## =======================================================\n",
    "## VECTORIZING\n",
    "## =======================================================\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "\n",
    "## ----- VECTORIZERS\n",
    "# Keyword groups shared by the vectorizers below; each call spells out only what differs.\n",
    "_base_kw = dict(encoding='latin-1', stop_words='english')      # used by every variant\n",
    "_alpha_kw = dict(token_pattern=r'(?u)\\b[a-zA-Z]{2,}\\b')        # tokens: 2+ ASCII letters only\n",
    "_uni_bi_kw = dict(ngram_range=(1,2))                            # unigrams + bigrams\n",
    "_prep_kw = dict(preprocessor=my_preprocessor)                   # custom short-document filter\n",
    "\n",
    "# Count-based vectorizers\n",
    "unigram_bool_cv_v1 = CountVectorizer(binary=True, min_df=5, **_base_kw)\n",
    "unigram_bool_cv_v2 = CountVectorizer(binary=True, min_df=5, **_base_kw, **_alpha_kw)\n",
    "\n",
    "unigram_cv = CountVectorizer(binary=False, min_df=5, **_base_kw, **_alpha_kw)\n",
    "\n",
    "bigram_cv = CountVectorizer(min_df=5, **_base_kw, **_uni_bi_kw)\n",
    "bigram_cv_v2 = CountVectorizer(min_df=5, **_base_kw, **_uni_bi_kw, **_alpha_kw)\n",
    "\n",
    "# TF-IDF vectorizers\n",
    "unigram_tv = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw)\n",
    "unigram_tv_v2 = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw, **_alpha_kw)\n",
    "\n",
    "bigram_tv = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw, **_uni_bi_kw)\n",
    "bigram_tv_v2 = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw, **_uni_bi_kw, **_alpha_kw)\n",
    "\n",
    "## ----- VECTORIZERS with PREPROCESSING\n",
    "\n",
    "unigram_tv_v3 = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw, **_alpha_kw, **_prep_kw)\n",
    "bigram_tv_v3 = TfidfVectorizer(use_idf=True, min_df=5, **_base_kw, **_uni_bi_kw, **_alpha_kw, **_prep_kw)\n",
    "\n",
    "# v4 passes no min_df at all; v5 sets it to 3.\n",
    "bigram_tv_v4 = TfidfVectorizer(use_idf=True, **_base_kw, **_uni_bi_kw, **_alpha_kw, **_prep_kw)\n",
    "\n",
    "bigram_tv_v5 = TfidfVectorizer(use_idf=True, min_df=3, **_base_kw, **_uni_bi_kw, **_alpha_kw, **_prep_kw)\n",
    "\n",
    "## =======================================================\n",
    "## MODELING\n",
    "## =======================================================\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from sklearn.svm import LinearSVC\n",
    "from sklearn.naive_bayes import BernoulliNB, MultinomialNB\n",
    "\n",
    "## ----- CLASSIFIERS\n",
    "# Module-level estimator instances; run_classifier() fits whichever one it is given in place.\n",
    "mnb = MultinomialNB()\n",
    "# random_state pins LinearSVC's internal shuffling so repeated runs give the same model,\n",
    "# in line with the fixed random_state already used for the train/test split.\n",
    "svm = LinearSVC(C=1, random_state=0)\n",
    "\n",
    "def get_test_train_vec(X, y, vectorizer, test_size=0.4, random_state=0):\n",
    "    \"\"\"Split X/y into train and test parts and vectorize the documents of each part.\n",
    "\n",
    "    The vectorizer is fitted on the training documents only and then applied to the\n",
    "    held-out documents, so the vocabulary is learned without looking at test data.\n",
    "\n",
    "    Args:\n",
    "        X: documents accepted by ``vectorizer``.\n",
    "        y: labels aligned with ``X``.\n",
    "        vectorizer: object exposing ``fit_transform`` and ``transform``; fitted in place.\n",
    "        test_size: forwarded to ``train_test_split`` (default 0.4, the former hard-coded value).\n",
    "        random_state: forwarded to ``train_test_split`` (default 0, the former hard-coded value).\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(X_train_vec, X_test_vec, y_train, y_test)``.\n",
    "    \"\"\"\n",
    "    X_train, X_test, y_train, y_test = train_test_split(\n",
    "        X, y, test_size=test_size, random_state=random_state)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "    X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, X_test_vec, y_train, y_test\n",
    "\n",
    "def run_classifier(X_train_vec, X_test_vec, y_train, y_test, labels, target_names, classifier):\n",
    "    \"\"\"Fit ``classifier`` on the training matrix and evaluate it on the test matrix.\n",
    "\n",
    "    Args:\n",
    "        X_train_vec, X_test_vec: vectorized train / test features.\n",
    "        y_train, y_test: labels matching the two matrices.\n",
    "        labels: class labels to report on, in the same order as ``target_names``.\n",
    "        target_names: display names for ``labels`` in the report.\n",
    "        classifier: estimator with ``fit``/``predict``/``score``; fitted in place (no copy).\n",
    "\n",
    "    Returns:\n",
    "        Tuple ``(clf, score, report)``: the fitted estimator (the same object that was\n",
    "        passed in), its ``score`` on the test set, and the classification report as a dict.\n",
    "    \"\"\"\n",
    "    clf = classifier\n",
    "    clf.fit(X_train_vec, y_train)\n",
    "    y_pred = clf.predict(X_test_vec)\n",
    "    # Pass labels explicitly so every entry of target_names stays tied to its label,\n",
    "    # even when a class happens to be absent from this test split.\n",
    "    report = classification_report(y_test, y_pred, labels=labels,\n",
    "                                   target_names=target_names, output_dict=True)\n",
    "    score = clf.score(X_test_vec, y_test)\n",
    "    return clf, score, report\n",
    "    \n",
    "def get_model(X, y, labels, target_names, classifier, vec):\n",
    "    \"\"\"Split and vectorize (X, y) with ``vec``, then fit and evaluate ``classifier``.\n",
    "\n",
    "    Returns the ``(model, score, report)`` triple produced by ``run_classifier``.\n",
    "    \"\"\"\n",
    "    split_parts = get_test_train_vec(X, y, vec)\n",
    "    return run_classifier(*split_parts, labels, target_names, classifier)\n",
    "    \n",
    "## =======================================================\n",
    "## VISUALIZING\n",
    "## =======================================================\n",
    "from tabulate import tabulate\n",
    "import pandas as pd\n",
    "\n",
    "def return_features(vec, model):\n",
    "    \"\"\"Print, for each row of the model's weight matrix, its ten lowest and ten highest features.\n",
    "\n",
    "    Args:\n",
    "        vec: fitted vectorizer providing the feature names for the model's columns.\n",
    "        model: fitted classifier exposing ``coef_``; when that attribute is missing,\n",
    "            ``feature_log_prob_`` is used instead.\n",
    "\n",
    "    Prints one table per row; returns None.\n",
    "    \"\"\"\n",
    "    # Newer scikit-learn releases replace get_feature_names() with get_feature_names_out().\n",
    "    if hasattr(vec, 'get_feature_names_out'):\n",
    "        feature_names = vec.get_feature_names_out()\n",
    "    else:\n",
    "        feature_names = vec.get_feature_names()\n",
    "    weights = model.coef_ if hasattr(model, 'coef_') else model.feature_log_prob_\n",
    "    for i, feature_probability in enumerate(weights):\n",
    "        print('============ Sentiment Score: ', i)\n",
    "        # Sort once, ascending by weight (ties broken by feature name).\n",
    "        ranked = sorted(zip(feature_probability, feature_names))\n",
    "        df1 = pd.DataFrame(ranked[:10])\n",
    "        df2 = pd.DataFrame(ranked[-10:])\n",
    "        df3 = pd.concat([df1, df2], axis=1)\n",
    "        # The first column pair holds the smallest weights, so it is the 'Least Likely' side.\n",
    "        print(tabulate(df3, tablefmt=\"fancy_grid\", headers=[\"Least\",\"Likely\",\"Most\",\"Likely\"], floatfmt=\".2f\"))\n",
    "\n",
    "def update_big_df(big_df, new_row):\n",
    "    big_df.append(new_row)\n",
    "    df = pd.DataFrame(big_df)\n",
    "    df = df.drop_duplicates()\n",
    "    return df\n",
    "\n",
    "\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### DATA GOES HERE:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Raw Kaggle phrases (tab-separated file); pandas is already imported as pd above.\n",
    "train = pd.read_csv(\"kaggle-sentiment/train.tsv\", sep='\\t')\n",
    "X, y = train['Phrase'].values, train['Sentiment'].values\n",
    "big_df = list()\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 144,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pre-processed sentiment table; `train` is rebound here and X / y are reassigned\n",
    "# at the bottom of this cell.\n",
    "train = pd.read_csv(\"../HW2/hw7_data_sentiment.csv\")\n",
    "\n",
    "def remove_na(string):\n",
    "#     print(type(string))\n",
    "    if type(string) == str:\n",
    "        return string\n",
    "    else:\n",
    "        return \"empty\"\n",
    "# Only the 'pruned' column is needed, so map over that Series instead of building\n",
    "# every full row with DataFrame.apply(axis=1); the resulting column is the same.\n",
    "train['pruned_2'] = train['pruned'].map(remove_na)\n",
    "\n",
    "y = train['labels'].values\n",
    "X = train['pruned_2'].values\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 153,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>0</th>\n",
       "      <th>labels</th>\n",
       "      <th>tokens</th>\n",
       "      <th>num_tokens</th>\n",
       "      <th>sentences</th>\n",
       "      <th>num_sentences</th>\n",
       "      <th>no_sw</th>\n",
       "      <th>num_no_sw</th>\n",
       "      <th>topwords_unfil</th>\n",
       "      <th>topwords_fil</th>\n",
       "      <th>...</th>\n",
       "      <th>v_pos_fd</th>\n",
       "      <th>bow</th>\n",
       "      <th>bow_nosw</th>\n",
       "      <th>diy_cleaner</th>\n",
       "      <th>pruned</th>\n",
       "      <th>nltk_negs</th>\n",
       "      <th>unigram_feats</th>\n",
       "      <th>bigram_feats</th>\n",
       "      <th>bigram_feats_neg</th>\n",
       "      <th>pruned_2</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>['a', 'series', 'of', 'escapades', 'demonstrat...</td>\n",
       "      <td>35</td>\n",
       "      <td>['A series of escapades demonstrating the adag...</td>\n",
       "      <td>1</td>\n",
       "      <td>['series', 'escapades', 'demonstrating', 'adag...</td>\n",
       "      <td>15</td>\n",
       "      <td>[('of', 4), ('the', 3), ('a', 2), ('is', 2), (...</td>\n",
       "      <td>[('good', 2), ('series', 1), ('escapades', 1),...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.307</td>\n",
       "      <td>Counter({'of': 4, 'the': 3, 'a': 2, 'is': 2, '...</td>\n",
       "      <td>Counter({'good': 2, 'series': 1, 'escapades': ...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>['a', 'series', 'of', 'escapades', 'demonstrat...</td>\n",
       "      <td>['the', 'of', 'is', 'good', 'for', 'of_NEG', '...</td>\n",
       "      <td>['a_series', 'series_of', 'of_escapades', 'esc...</td>\n",
       "      <td>['a_series', 'series_of', 'of_escapades', 'esc...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>2</td>\n",
       "      <td>['a', 'series', 'of', 'escapades', 'demonstrat...</td>\n",
       "      <td>14</td>\n",
       "      <td>['A series of escapades demonstrating the adag...</td>\n",
       "      <td>1</td>\n",
       "      <td>['series', 'escapades', 'demonstrating', 'adag...</td>\n",
       "      <td>6</td>\n",
       "      <td>[('the', 2), ('a', 1), ('series', 1), ('of', 1...</td>\n",
       "      <td>[('series', 1), ('escapades', 1), ('demonstrat...</td>\n",
       "      <td>...</td>\n",
       "      <td>0.367</td>\n",
       "      <td>Counter({'the': 2, 'a': 1, 'series': 1, 'of': ...</td>\n",
       "      <td>Counter({'series': 1, 'escapades': 1, 'demonst...</td>\n",
       "      <td>a series of escapades demonstrating the adage ...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "      <td>['a', 'series', 'of', 'escapades', 'demonstrat...</td>\n",
       "      <td>['the', 'a', 'series', 'of', 'escapades', 'dem...</td>\n",
       "      <td>['a_series', 'series_of', 'of_escapades', 'esc...</td>\n",
       "      <td>['a_series', 'series_of', 'of_escapades', 'esc...</td>\n",
       "      <td>series escapades demonstrating adage that what...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>2</td>\n",
       "      <td>['a', 'series']</td>\n",
       "      <td>2</td>\n",
       "      <td>['A series']</td>\n",
       "      <td>1</td>\n",
       "      <td>['series']</td>\n",
       "      <td>1</td>\n",
       "      <td>[('a', 1), ('series', 1)]</td>\n",
       "      <td>[('series', 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>Counter({'a': 1, 'series': 1})</td>\n",
       "      <td>Counter({'series': 1})</td>\n",
       "      <td>a series a series a series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>['a', 'series']</td>\n",
       "      <td>['a', 'series']</td>\n",
       "      <td>['a_series']</td>\n",
       "      <td>['a_series']</td>\n",
       "      <td>series series series</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>2</td>\n",
       "      <td>['a']</td>\n",
       "      <td>1</td>\n",
       "      <td>['A']</td>\n",
       "      <td>1</td>\n",
       "      <td>[]</td>\n",
       "      <td>0</td>\n",
       "      <td>[('a', 1)]</td>\n",
       "      <td>[]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>Counter({'a': 1})</td>\n",
       "      <td>Counter()</td>\n",
       "      <td>a a a</td>\n",
       "      <td>NaN</td>\n",
       "      <td>['a']</td>\n",
       "      <td>['a']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>empty</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>2</td>\n",
       "      <td>['series']</td>\n",
       "      <td>1</td>\n",
       "      <td>['series']</td>\n",
       "      <td>1</td>\n",
       "      <td>['series']</td>\n",
       "      <td>1</td>\n",
       "      <td>[('series', 1)]</td>\n",
       "      <td>[('series', 1)]</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000</td>\n",
       "      <td>Counter({'series': 1})</td>\n",
       "      <td>Counter({'series': 1})</td>\n",
       "      <td>series series series</td>\n",
       "      <td>series series series</td>\n",
       "      <td>['series']</td>\n",
       "      <td>['series']</td>\n",
       "      <td>[]</td>\n",
       "      <td>[]</td>\n",
       "      <td>series series series</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 40 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   0  labels  \\\n",
       "0  A series of escapades demonstrating the adage ...       1   \n",
       "1  A series of escapades demonstrating the adage ...       2   \n",
       "2                                           A series       2   \n",
       "3                                                  A       2   \n",
       "4                                             series       2   \n",
       "\n",
       "                                              tokens  num_tokens  \\\n",
       "0  ['a', 'series', 'of', 'escapades', 'demonstrat...          35   \n",
       "1  ['a', 'series', 'of', 'escapades', 'demonstrat...          14   \n",
       "2                                    ['a', 'series']           2   \n",
       "3                                              ['a']           1   \n",
       "4                                         ['series']           1   \n",
       "\n",
       "                                           sentences  num_sentences  \\\n",
       "0  ['A series of escapades demonstrating the adag...              1   \n",
       "1  ['A series of escapades demonstrating the adag...              1   \n",
       "2                                       ['A series']              1   \n",
       "3                                              ['A']              1   \n",
       "4                                         ['series']              1   \n",
       "\n",
       "                                               no_sw  num_no_sw  \\\n",
       "0  ['series', 'escapades', 'demonstrating', 'adag...         15   \n",
       "1  ['series', 'escapades', 'demonstrating', 'adag...          6   \n",
       "2                                         ['series']          1   \n",
       "3                                                 []          0   \n",
       "4                                         ['series']          1   \n",
       "\n",
       "                                      topwords_unfil  \\\n",
       "0  [('of', 4), ('the', 3), ('a', 2), ('is', 2), (...   \n",
       "1  [('the', 2), ('a', 1), ('series', 1), ('of', 1...   \n",
       "2                          [('a', 1), ('series', 1)]   \n",
       "3                                         [('a', 1)]   \n",
       "4                                    [('series', 1)]   \n",
       "\n",
       "                                        topwords_fil  ... v_pos_fd  \\\n",
       "0  [('good', 2), ('series', 1), ('escapades', 1),...  ...    0.307   \n",
       "1  [('series', 1), ('escapades', 1), ('demonstrat...  ...    0.367   \n",
       "2                                    [('series', 1)]  ...    0.000   \n",
       "3                                                 []  ...    0.000   \n",
       "4                                    [('series', 1)]  ...    0.000   \n",
       "\n",
       "                                                 bow  \\\n",
       "0  Counter({'of': 4, 'the': 3, 'a': 2, 'is': 2, '...   \n",
       "1  Counter({'the': 2, 'a': 1, 'series': 1, 'of': ...   \n",
       "2                     Counter({'a': 1, 'series': 1})   \n",
       "3                                  Counter({'a': 1})   \n",
       "4                             Counter({'series': 1})   \n",
       "\n",
       "                                            bow_nosw  \\\n",
       "0  Counter({'good': 2, 'series': 1, 'escapades': ...   \n",
       "1  Counter({'series': 1, 'escapades': 1, 'demonst...   \n",
       "2                             Counter({'series': 1})   \n",
       "3                                          Counter()   \n",
       "4                             Counter({'series': 1})   \n",
       "\n",
       "                                         diy_cleaner  \\\n",
       "0  a series of escapades demonstrating the adage ...   \n",
       "1  a series of escapades demonstrating the adage ...   \n",
       "2                         a series a series a series   \n",
       "3                                              a a a   \n",
       "4                               series series series   \n",
       "\n",
       "                                              pruned  \\\n",
       "0  series escapades demonstrating adage that what...   \n",
       "1  series escapades demonstrating adage that what...   \n",
       "2                               series series series   \n",
       "3                                                NaN   \n",
       "4                               series series series   \n",
       "\n",
       "                                           nltk_negs  \\\n",
       "0  ['a', 'series', 'of', 'escapades', 'demonstrat...   \n",
       "1  ['a', 'series', 'of', 'escapades', 'demonstrat...   \n",
       "2                                    ['a', 'series']   \n",
       "3                                              ['a']   \n",
       "4                                         ['series']   \n",
       "\n",
       "                                       unigram_feats  \\\n",
       "0  ['the', 'of', 'is', 'good', 'for', 'of_NEG', '...   \n",
       "1  ['the', 'a', 'series', 'of', 'escapades', 'dem...   \n",
       "2                                    ['a', 'series']   \n",
       "3                                              ['a']   \n",
       "4                                         ['series']   \n",
       "\n",
       "                                        bigram_feats  \\\n",
       "0  ['a_series', 'series_of', 'of_escapades', 'esc...   \n",
       "1  ['a_series', 'series_of', 'of_escapades', 'esc...   \n",
       "2                                       ['a_series']   \n",
       "3                                                 []   \n",
       "4                                                 []   \n",
       "\n",
       "                                    bigram_feats_neg  \\\n",
       "0  ['a_series', 'series_of', 'of_escapades', 'esc...   \n",
       "1  ['a_series', 'series_of', 'of_escapades', 'esc...   \n",
       "2                                       ['a_series']   \n",
       "3                                                 []   \n",
       "4                                                 []   \n",
       "\n",
       "                                            pruned_2  \n",
       "0  series escapades demonstrating adage that what...  \n",
       "1  series escapades demonstrating adage that what...  \n",
       "2                               series series series  \n",
       "3                                              empty  \n",
       "4                               series series series  \n",
       "\n",
       "[5 rows x 40 columns]"
      ]
     },
     "execution_count": 153,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Reset the results log that update_big_df() appends to, then preview the table.\n",
    "big_df = list()\n",
    "train.head(n=5)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# TASK 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## With Preprocessing"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 147,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤═══════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely    │   Least │ Likely     │\n",
      "╞════╪════════╪═══════════╪═════════╪════════════╡\n",
      "│  0 │  -9.92 │ aaliyah   │   -6.55 │ time       │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  1 │  -9.92 │ abandoned │   -6.55 │ characters │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  2 │  -9.92 │ abbott    │   -6.47 │ comedy     │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  3 │  -9.92 │ abdul     │   -6.45 │ dull       │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  4 │  -9.92 │ abel      │   -6.37 │ minutes    │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  5 │  -9.92 │ ably      │   -6.12 │ worst      │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  6 │  -9.92 │ aborted   │   -6.09 │ just       │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  7 │  -9.92 │ abound    │   -5.92 │ like       │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  8 │  -9.92 │ abrahams  │   -5.75 │ film       │\n",
      "├────┼────────┼───────────┼─────────┼────────────┤\n",
      "│  9 │  -9.92 │ abridged  │   -4.98 │ movie      │\n",
      "╘════╧════════╧═══════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤══════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely       │   Least │ Likely     │\n",
      "╞════╪════════╪══════════════╪═════════╪════════════╡\n",
      "│  0 │ -10.63 │ abel         │   -6.35 │ time       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  1 │ -10.63 │ ably         │   -6.28 │ plot       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  2 │ -10.63 │ abound       │   -6.20 │ characters │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  3 │ -10.63 │ abrahams     │   -6.00 │ story      │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  4 │ -10.63 │ abroad       │   -5.97 │ little     │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  5 │ -10.63 │ absorb       │   -5.91 │ does       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  6 │ -10.63 │ accentuating │   -5.91 │ just       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  7 │ -10.63 │ access       │   -5.67 │ like       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  8 │ -10.63 │ accidental   │   -5.41 │ film       │\n",
      "├────┼────────┼──────────────┼─────────┼────────────┤\n",
      "│  9 │ -10.63 │ acclaim      │   -5.13 │ movie      │\n",
      "╘════╧════════╧══════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely         │   Least │ Likely     │\n",
      "╞════╪════════╪════════════════╪═════════╪════════════╡\n",
      "│  0 │ -11.26 │ acclaim        │   -6.21 │ little     │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  1 │ -11.26 │ act            │   -6.17 │ movies     │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  2 │ -11.26 │ acumen         │   -6.09 │ rrb        │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  3 │ -11.26 │ adding         │   -6.09 │ characters │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  4 │ -11.26 │ admirers       │   -6.03 │ life       │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  5 │ -11.26 │ adorability    │   -5.90 │ time       │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  6 │ -11.26 │ affectionately │   -5.78 │ story      │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  7 │ -11.26 │ affirms        │   -5.76 │ like       │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  8 │ -11.26 │ ailments       │   -5.12 │ movie      │\n",
      "├────┼────────┼────────────────┼─────────┼────────────┤\n",
      "│  9 │ -11.26 │ airhead        │   -5.07 │ film       │\n",
      "╘════╧════════╧════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════╤═════════╤══════════╕\n",
      "│    │   Most │ Likely      │   Least │ Likely   │\n",
      "╞════╪════════╪═════════════╪═════════╪══════════╡\n",
      "│  0 │ -10.77 │ aaliyah     │   -6.14 │ best     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  1 │ -10.77 │ abbott      │   -6.12 │ life     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  2 │ -10.77 │ abdul       │   -6.12 │ time     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  3 │ -10.77 │ abhorrent   │   -6.11 │ like     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  4 │ -10.77 │ abomination │   -6.01 │ love     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  5 │ -10.77 │ aborted     │   -6.01 │ story    │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  6 │ -10.77 │ abridged    │   -5.61 │ funny    │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  7 │ -10.77 │ abrupt      │   -5.32 │ movie    │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  8 │ -10.77 │ absence     │   -5.26 │ good     │\n",
      "├────┼────────┼─────────────┼─────────┼──────────┤\n",
      "│  9 │ -10.77 │ absent      │   -5.09 │ film     │\n",
      "╘════╧════════╧═════════════╧═════════╧══════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤═════════════╤═════════╤══════════════╕\n",
      "│    │   Most │ Likely      │   Least │ Likely       │\n",
      "╞════╪════════╪═════════════╪═════════╪══════════════╡\n",
      "│  0 │ -10.04 │ aaliyah     │   -6.19 │ entertaining │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  1 │ -10.04 │ abandon     │   -6.18 │ comedy       │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  2 │ -10.04 │ abandoned   │   -6.12 │ performance  │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  3 │ -10.04 │ abbott      │   -6.03 │ performances │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  4 │ -10.04 │ abdul       │   -5.92 │ great        │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  5 │ -10.04 │ abhorrent   │   -5.79 │ good         │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  6 │ -10.04 │ abject      │   -5.62 │ funny        │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  7 │ -10.04 │ ably        │   -5.47 │ movie        │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  8 │ -10.04 │ abomination │   -5.32 │ best         │\n",
      "├────┼────────┼─────────────┼─────────┼──────────────┤\n",
      "│  9 │ -10.04 │ aborted     │   -5.09 │ film         │\n",
      "╘════╧════════╧═════════════╧═════════╧══════════════╛\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classifier</th>\n",
       "      <th>vectorizer</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.579779</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  classifier vectorizer     score\n",
       "0        mnb         V1  0.579779"
      ]
     },
     "execution_count": 147,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = unigram_tv_v3\n",
    "classifier = mnb\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "\n",
    "df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 148,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤═══════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely            │   Least │ Likely     │\n",
      "╞════╪════════╪═══════════════════╪═════════╪════════════╡\n",
      "│  0 │ -10.71 │ aaliyah           │   -7.63 │ characters │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  1 │ -10.71 │ abandon political │   -7.63 │ time       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  2 │ -10.71 │ abandoned         │   -7.53 │ comedy     │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  3 │ -10.71 │ abbott            │   -7.48 │ minutes    │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  4 │ -10.71 │ abdul             │   -7.47 │ dull       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  5 │ -10.71 │ abdul malik       │   -7.27 │ worst      │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  6 │ -10.71 │ abel              │   -7.12 │ just       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  7 │ -10.71 │ abel ferrara      │   -7.03 │ like       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  8 │ -10.71 │ ability document  │   -6.83 │ film       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  9 │ -10.71 │ ability images    │   -6.08 │ movie      │\n",
      "╘════╧════════╧═══════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤═══════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely            │   Least │ Likely     │\n",
      "╞════╪════════╪═══════════════════╪═════════╪════════════╡\n",
      "│  0 │ -11.16 │ abandon political │   -7.18 │ time       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  1 │ -11.16 │ abel              │   -7.07 │ plot       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  2 │ -11.16 │ abel ferrara      │   -6.97 │ characters │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  3 │ -11.16 │ ability document  │   -6.83 │ story      │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  4 │ -11.16 │ ability images    │   -6.80 │ little     │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  5 │ -11.16 │ ability make      │   -6.75 │ does       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  6 │ -11.16 │ ability spoof     │   -6.71 │ just       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  7 │ -11.16 │ abject suffering  │   -6.50 │ like       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  8 │ -11.16 │ able accomplish   │   -6.24 │ film       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  9 │ -11.16 │ able look         │   -5.97 │ movie      │\n",
      "╘════╧════════╧═══════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤══════════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely               │   Least │ Likely     │\n",
      "╞════╪════════╪══════════════════════╪═════════╪════════════╡\n",
      "│  0 │ -11.64 │ abandon theater      │   -6.89 │ little     │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  1 │ -11.64 │ able performances    │   -6.87 │ movies     │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  2 │ -11.64 │ able project         │   -6.80 │ characters │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  3 │ -11.64 │ able tear            │   -6.75 │ life       │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  4 │ -11.64 │ abrupt drop          │   -6.71 │ rrb        │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  5 │ -11.64 │ absolute delight     │   -6.62 │ time       │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  6 │ -11.64 │ absolutely amazing   │   -6.50 │ story      │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  7 │ -11.64 │ absolutely essential │   -6.46 │ like       │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  8 │ -11.64 │ absorbing look       │   -5.85 │ movie      │\n",
      "├────┼────────┼──────────────────────┼─────────┼────────────┤\n",
      "│  9 │ -11.64 │ absorbing piece      │   -5.81 │ film       │\n",
      "╘════╧════════╧══════════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════════════╤═════════╤══════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely   │\n",
      "╞════╪════════╪═════════════════════╪═════════╪══════════╡\n",
      "│  0 │ -11.26 │ aaliyah             │   -6.95 │ time     │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  1 │ -11.26 │ abandon theater     │   -6.95 │ comedy   │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  2 │ -11.26 │ abbott              │   -6.95 │ life     │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  3 │ -11.26 │ abdul               │   -6.94 │ like     │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  4 │ -11.26 │ abdul malik         │   -6.84 │ love     │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  5 │ -11.26 │ abhorrent           │   -6.79 │ story    │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  6 │ -11.26 │ abhorrent abhorrent │   -6.42 │ funny    │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  7 │ -11.26 │ able enjoy          │   -6.15 │ movie    │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  8 │ -11.26 │ able project        │   -6.09 │ good     │\n",
      "├────┼────────┼─────────────────────┼─────────┼──────────┤\n",
      "│  9 │ -11.26 │ abomination         │   -5.90 │ film     │\n",
      "╘════╧════════╧═════════════════════╧═════════╧══════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤═══════════════════╤═════════╤══════════════╕\n",
      "│    │   Most │ Likely            │   Least │ Likely       │\n",
      "╞════╪════════╪═══════════════════╪═════════╪══════════════╡\n",
      "│  0 │ -10.78 │ aaliyah           │   -7.22 │ comedy       │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  1 │ -10.78 │ abandon           │   -7.15 │ entertaining │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  2 │ -10.78 │ abandon political │   -7.08 │ performance  │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  3 │ -10.78 │ abandon scripts   │   -7.00 │ performances │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  4 │ -10.78 │ abandon theater   │   -6.99 │ great        │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  5 │ -10.78 │ abandoned         │   -6.89 │ good         │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  6 │ -10.78 │ abbott            │   -6.64 │ funny        │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  7 │ -10.78 │ abdul             │   -6.53 │ movie        │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  8 │ -10.78 │ abdul malik       │   -6.45 │ best         │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────────┤\n",
      "│  9 │ -10.78 │ abhorrent         │   -6.13 │ film         │\n",
      "╘════╧════════╧═══════════════════╧═════════╧══════════════╛\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classifier</th>\n",
       "      <th>vectorizer</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.579779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.593444</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  classifier vectorizer     score\n",
       "0        mnb         V1  0.579779\n",
       "1        mnb         V1  0.593444"
      ]
     },
     "execution_count": 148,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = bigram_tv_v3\n",
    "classifier = mnb\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "\n",
    "df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 149,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤══════════════════════╤═════════╤══════════════════════╕\n",
      "│    │   Most │ Likely               │   Least │ Likely               │\n",
      "╞════╪════════╪══════════════════════╪═════════╪══════════════════════╡\n",
      "│  0 │  -1.32 │ hawke                │    2.00 │ entirely witless     │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  1 │  -1.22 │ loving               │    2.01 │ hideously            │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  2 │  -1.21 │ works minutes        │    2.02 │ time stinker         │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  3 │  -1.13 │ flick film           │    2.06 │ disgusting           │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  4 │  -1.13 │ movie does           │    2.08 │ minded stereotypical │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  5 │  -1.12 │ mind right           │    2.12 │ unappealing          │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  6 │  -1.11 │ watching documentary │    2.18 │ unwatchable          │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  7 │  -1.11 │ fair share           │    2.19 │ film barely          │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  8 │  -1.08 │ film make            │    2.34 │ premise just         │\n",
      "├────┼────────┼──────────────────────┼─────────┼──────────────────────┤\n",
      "│  9 │  -1.07 │ trash cinema         │    2.71 │ disappointment       │\n",
      "╘════╧════════╧══════════════════════╧═════════╧══════════════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤═════════════════════════╤═════════╤═════════════════════╕\n",
      "│    │   Most │ Likely                  │   Least │ Likely              │\n",
      "╞════╪════════╪═════════════════════════╪═════════╪═════════════════════╡\n",
      "│  0 │  -1.90 │ hard stop               │    2.04 │ animation years     │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  1 │  -1.87 │ unlikable uninteresting │    2.05 │ informative titular │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  2 │  -1.74 │ sensibility             │    2.06 │ does live           │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  3 │  -1.74 │ minded stereotypical    │    2.07 │ pretty mediocre     │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  4 │  -1.67 │ degenerating pious      │    2.09 │ losing touch        │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  5 │  -1.63 │ calls                   │    2.09 │ organic intrigue    │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  6 │  -1.62 │ movie quirky            │    2.19 │ chops looks         │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  7 │  -1.58 │ dull tagline            │    2.21 │ note performance    │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  8 │  -1.58 │ contrived overblown     │    2.34 │ suck suck           │\n",
      "├────┼────────┼─────────────────────────┼─────────┼─────────────────────┤\n",
      "│  9 │  -1.56 │ film barely             │    2.57 │ funny entertaining  │\n",
      "╘════╧════════╧═════════════════════════╧═════════╧═════════════════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤═══════════════╤═════════╤════════════════════╕\n",
      "│    │   Most │ Likely        │   Least │ Likely             │\n",
      "╞════╪════════╪═══════════════╪═════════╪════════════════════╡\n",
      "│  0 │  -2.79 │ delightful    │    1.76 │ sweet smile        │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  1 │  -2.56 │ beautiful     │    1.83 │ hours does         │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  2 │  -2.53 │ wasted        │    1.84 │ smarter smarter    │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  3 │  -2.52 │ beautifully   │    1.85 │ budget movie       │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  4 │  -2.42 │ stunning      │    1.89 │ morally superior   │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  5 │  -2.41 │ terrific      │    1.90 │ summer divine      │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  6 │  -2.31 │ perfect       │    2.03 │ like big           │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  7 │  -2.27 │ unimaginative │    2.06 │ enjoy mindless     │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  8 │  -2.26 │ extraordinary │    2.13 │ inquiries          │\n",
      "├────┼────────┼───────────────┼─────────┼────────────────────┤\n",
      "│  9 │  -2.22 │ masterpiece   │    2.22 │ details ultimately │\n",
      "╘════╧════════╧═══════════════╧═════════╧════════════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════════════╤═════════╤════════════════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely             │\n",
      "╞════╪════════╪═════════════════════╪═════════╪════════════════════╡\n",
      "│  0 │  -2.42 │ devoid              │    1.97 │ enjoyment          │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  1 │  -2.00 │ film offers         │    1.98 │ characters inhabit │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  2 │  -1.96 │ rrb film            │    2.00 │ movie looking      │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  3 │  -1.93 │ zings               │    2.02 │ variety tones      │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  4 │  -1.89 │ unfulfilling        │    2.07 │ stimulating        │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  5 │  -1.89 │ unless              │    2.12 │ like lead          │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  6 │  -1.86 │ heartfelt hilarious │    2.14 │ film ambitious     │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  7 │  -1.86 │ amusing tender      │    2.22 │ positive           │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  8 │  -1.85 │ delightful witty    │    2.47 │ hard resist        │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  9 │  -1.85 │ irresistible blend  │    2.50 │ half bad           │\n",
      "╘════╧════════╧═════════════════════╧═════════╧════════════════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤══════════════════════════╤═════════╤════════════════╕\n",
      "│    │   Most │ Likely                   │   Least │ Likely         │\n",
      "╞════╪════════╪══════════════════════════╪═════════╪════════════════╡\n",
      "│  0 │  -1.50 │ devastating experience   │    2.07 │ breathtakingly │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  1 │  -1.42 │ sickly                   │    2.11 │ riveted        │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  2 │  -1.38 │ unless                   │    2.13 │ superbly       │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  3 │  -1.27 │ vividly vividly          │    2.16 │ dazzling       │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  4 │  -1.25 │ remarkably unpretentious │    2.17 │ brilliant      │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  5 │  -1.24 │ make worth               │    2.18 │ exquisite      │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  6 │  -1.23 │ uneven                   │    2.21 │ perfection     │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  7 │  -1.21 │ entertainment comes      │    2.21 │ splendid       │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  8 │  -1.19 │ border                   │    2.24 │ beautifully    │\n",
      "├────┼────────┼──────────────────────────┼─────────┼────────────────┤\n",
      "│  9 │  -1.17 │ girls swim               │    2.53 │ masterpiece    │\n",
      "╘════╧════════╧══════════════════════════╧═════════╧════════════════╛\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classifier</th>\n",
       "      <th>vectorizer</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.579779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.593444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.628302</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  classifier vectorizer     score\n",
       "0        mnb         V1  0.579779\n",
       "1        mnb         V1  0.593444\n",
       "2        svm         V1  0.628302"
      ]
     },
     "execution_count": 149,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = bigram_tv_v3\n",
    "classifier = svm\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "\n",
    "df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 150,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤═════════════════╤═════════╤════════════════╕\n",
      "│    │   Most │ Likely          │   Least │ Likely         │\n",
      "╞════╪════════╪═════════════════╪═════════╪════════════════╡\n",
      "│  0 │  -1.47 │ stupid stupid   │    1.83 │ disaster       │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  1 │  -1.14 │ movie does      │    1.83 │ unmemorable    │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  2 │  -1.13 │ worse worse     │    1.87 │ awful          │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  3 │  -1.06 │ plain boring    │    1.87 │ unappealing    │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  4 │  -1.06 │ fails fails     │    1.89 │ premise just   │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  5 │  -1.03 │ waste waste     │    1.97 │ worst          │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  6 │  -0.98 │ comedy year     │    1.97 │ unwatchable    │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  7 │  -0.96 │ badly badly     │    1.97 │ ugly           │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  8 │  -0.95 │ tedious tedious │    2.12 │ pathetic       │\n",
      "├────┼────────┼─────────────────┼─────────┼────────────────┤\n",
      "│  9 │  -0.93 │ zero zero       │    2.30 │ disappointment │\n",
      "╘════╧════════╧═════════════════╧═════════╧════════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤═════════════════════╤═════════╤════════════════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely             │\n",
      "╞════╪════════╪═════════════════════╪═════════╪════════════════════╡\n",
      "│  0 │  -1.54 │ pretentious mess    │    1.82 │ funny entertaining │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  1 │  -1.49 │ hard stop           │    1.84 │ fails              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  2 │  -1.41 │ film barely         │    1.88 │ sadly              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  3 │  -1.37 │ lacks substance     │    1.88 │ mushy              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  4 │  -1.35 │ dull tagline        │    1.90 │ foul               │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  5 │  -1.31 │ powerful            │    1.95 │ bland              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  6 │  -1.30 │ like movie          │    2.01 │ ridiculous         │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  7 │  -1.29 │ unnecessary retread │    2.09 │ lacking            │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  8 │  -1.28 │ does rely           │    2.23 │ lack               │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  9 │  -1.28 │ care really         │    2.75 │ lacks              │\n",
      "╘════╧════════╧═════════════════════╧═════════╧════════════════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤═════════════╤═════════╤═══════════════════════════╕\n",
      "│    │   Most │ Likely      │   Least │ Likely                    │\n",
      "╞════╪════════╪═════════════╪═════════╪═══════════════════════════╡\n",
      "│  0 │  -2.88 │ perfect     │    1.57 │ really really             │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  1 │  -2.81 │ best        │    1.58 │ suspense suspense         │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  2 │  -2.76 │ brilliant   │    1.58 │ dialogue dialogue         │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  3 │  -2.75 │ beautiful   │    1.60 │ good cheesy               │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  4 │  -2.74 │ good        │    1.64 │ surprising surprising     │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  5 │  -2.69 │ beautifully │    1.69 │ rrb lrb                   │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  6 │  -2.65 │ delightful  │    1.72 │ smarter smarter           │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  7 │  -2.63 │ hilarious   │    1.75 │ performances performances │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  8 │  -2.60 │ funny       │    1.87 │ budget movie              │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────────────┤\n",
      "│  9 │  -2.58 │ great       │    1.92 │ enjoy mindless            │\n",
      "╘════╧════════╧═════════════╧═════════╧═══════════════════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════════════╤═════════╤═════════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely      │\n",
      "╞════╪════════╪═════════════════════╪═════════╪═════════════╡\n",
      "│  0 │  -1.90 │ film offers         │    1.87 │ interesting │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  1 │  -1.85 │ lacking             │    1.88 │ bittersweet │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  2 │  -1.74 │ devoid              │    1.90 │ enjoy       │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  3 │  -1.67 │ lacks               │    1.96 │ enjoyment   │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  4 │  -1.58 │ loses               │    1.99 │ appealing   │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  5 │  -1.56 │ story fascinating   │    2.01 │ hard resist │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  6 │  -1.52 │ delightful witty    │    2.04 │ decent      │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  7 │  -1.49 │ amusing tender      │    2.07 │ positive    │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  8 │  -1.48 │ earnest earnest     │    2.12 │ pleasant    │\n",
      "├────┼────────┼─────────────────────┼─────────┼─────────────┤\n",
      "│  9 │  -1.47 │ heartfelt hilarious │    2.30 │ good        │\n",
      "╘════╧════════╧═════════════════════╧═════════╧═════════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤═══════════════════════════╤═════════╤═══════════════╕\n",
      "│    │   Most │ Likely                    │   Least │ Likely        │\n",
      "╞════╪════════╪═══════════════════════════╪═════════╪═══════════════╡\n",
      "│  0 │  -1.50 │ excellent film            │    2.09 │ magnificent   │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  1 │  -1.48 │ moving moving             │    2.09 │ wonderful     │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  2 │  -1.17 │ best best                 │    2.16 │ extraordinary │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  3 │  -1.16 │ entertainment comes       │    2.18 │ dazzling      │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  4 │  -1.13 │ performances performances │    2.19 │ best          │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  5 │  -1.12 │ devastating experience    │    2.27 │ beautifully   │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  6 │  -1.10 │ stunning stunning         │    2.30 │ stunning      │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  7 │  -1.10 │ fascinating fascinating   │    2.32 │ terrific      │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  8 │  -1.08 │ intelligent intelligent   │    2.41 │ masterpiece   │\n",
      "├────┼────────┼───────────────────────────┼─────────┼───────────────┤\n",
      "│  9 │  -1.04 │ bond solid                │    2.48 │ brilliant     │\n",
      "╘════╧════════╧═══════════════════════════╧═════════╧═══════════════╛\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classifier</th>\n",
       "      <th>vectorizer</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.579779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.593444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.628302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.629617</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  classifier vectorizer     score\n",
       "0        mnb         V1  0.579779\n",
       "1        mnb         V1  0.593444\n",
       "2        svm         V1  0.628302\n",
       "3        svm         V1  0.629617"
      ]
     },
     "execution_count": 150,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = bigram_tv_v4\n",
    "classifier = svm\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "\n",
    "df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 151,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤════════════════════════╤═════════╤════════════════╕\n",
      "│    │   Most │ Likely                 │   Least │ Likely         │\n",
      "╞════╪════════╪════════════════════════╪═════════╪════════════════╡\n",
      "│  0 │  -1.30 │ plain boring           │    1.94 │ repugnant      │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  1 │  -1.27 │ stupid stupid          │    1.95 │ unmemorable    │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  2 │  -1.19 │ movie does             │    1.99 │ pathetic       │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  3 │  -1.09 │ badly badly            │    2.03 │ disgusting     │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  4 │  -1.07 │ comedy year            │    2.06 │ worst          │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  5 │  -1.06 │ incompetent incoherent │    2.06 │ premise just   │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  6 │  -1.03 │ fails fails            │    2.07 │ unwatchable    │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  7 │  -1.01 │ movie horrible         │    2.08 │ awful          │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  8 │  -1.00 │ works minutes          │    2.08 │ unappealing    │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────────┤\n",
      "│  9 │  -0.99 │ worse worse            │    2.60 │ disappointment │\n",
      "╘════╧════════╧════════════════════════╧═════════╧════════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤═════════════════════╤═════════╤════════════════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely             │\n",
      "╞════╪════════╪═════════════════════╪═════════╪════════════════════╡\n",
      "│  0 │  -1.87 │ lacks substance     │    1.91 │ loses              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  1 │  -1.78 │ pretentious mess    │    1.93 │ lack               │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  2 │  -1.78 │ hard stop           │    1.93 │ absurdity          │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  3 │  -1.57 │ contrived overblown │    1.96 │ ridiculous         │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  4 │  -1.52 │ characters team     │    1.96 │ foul               │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  5 │  -1.50 │ powerful            │    1.98 │ special final      │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  6 │  -1.45 │ dull tagline        │    2.07 │ funny entertaining │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  7 │  -1.44 │ does disgrace       │    2.12 │ sadly              │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  8 │  -1.42 │ acted poorly        │    2.27 │ lacking            │\n",
      "├────┼────────┼─────────────────────┼─────────┼────────────────────┤\n",
      "│  9 │  -1.42 │ justice awfulness   │    2.77 │ lacks              │\n",
      "╘════╧════════╧═════════════════════╧═════════╧════════════════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤═════════════╤═════════╤═══════════════════╕\n",
      "│    │   Most │ Likely      │   Least │ Likely            │\n",
      "╞════╪════════╪═════════════╪═════════╪═══════════════════╡\n",
      "│  0 │  -2.78 │ beautiful   │    1.70 │ rrb lrb           │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  1 │  -2.76 │ perfect     │    1.75 │ funny beautifully │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  2 │  -2.71 │ delightful  │    1.77 │ watchable hardly  │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  3 │  -2.67 │ beautifully │    1.79 │ funny laugh       │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  4 │  -2.61 │ brilliant   │    1.81 │ good cheesy       │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  5 │  -2.60 │ best        │    1.85 │ smarter smarter   │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  6 │  -2.58 │ remarkable  │    1.86 │ budget movie      │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  7 │  -2.57 │ terrific    │    1.90 │ morally superior  │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  8 │  -2.55 │ good        │    1.96 │ avoid fatal       │\n",
      "├────┼────────┼─────────────┼─────────┼───────────────────┤\n",
      "│  9 │  -2.53 │ great       │    1.96 │ enjoy mindless    │\n",
      "╘════╧════════╧═════════════╧═════════╧═══════════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════════════╤═════════╤═══════════════╕\n",
      "│    │   Most │ Likely              │   Least │ Likely        │\n",
      "╞════╪════════╪═════════════════════╪═════════╪═══════════════╡\n",
      "│  0 │  -2.30 │ devoid              │    1.90 │ charm         │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  1 │  -2.17 │ film offers         │    1.94 │ decent        │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  2 │  -1.93 │ lacking             │    1.94 │ confidence    │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  3 │  -1.86 │ loses               │    1.95 │ method fails  │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  4 │  -1.76 │ lacks               │    1.95 │ movie looking │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  5 │  -1.76 │ amusing tender      │    2.02 │ hard resist   │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  6 │  -1.76 │ heartfelt hilarious │    2.04 │ good          │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  7 │  -1.70 │ reminds animation   │    2.08 │ enjoyment     │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  8 │  -1.63 │ seagal sharp        │    2.12 │ positive      │\n",
      "├────┼────────┼─────────────────────┼─────────┼───────────────┤\n",
      "│  9 │  -1.62 │ story fascinating   │    2.27 │ half bad      │\n",
      "╘════╧════════╧═════════════════════╧═════════╧═══════════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤════════════════════════╤═════════╤═══════════════╕\n",
      "│    │   Most │ Likely                 │   Least │ Likely        │\n",
      "╞════╪════════╪════════════════════════╪═════════╪═══════════════╡\n",
      "│  0 │  -1.56 │ excellent film         │    2.16 │ exquisite     │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  1 │  -1.36 │ moving moving          │    2.16 │ magnificent   │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  2 │  -1.28 │ uneven                 │    2.17 │ beautifully   │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  3 │  -1.27 │ devastating experience │    2.17 │ terrific      │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  4 │  -1.24 │ entertainment comes    │    2.19 │ excellent     │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  5 │  -1.21 │ turns gripping         │    2.30 │ masterfully   │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  6 │  -1.20 │ films thoughtful       │    2.30 │ brilliant     │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  7 │  -1.16 │ great fiery            │    2.30 │ splendid      │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  8 │  -1.12 │ make worth             │    2.34 │ extraordinary │\n",
      "├────┼────────┼────────────────────────┼─────────┼───────────────┤\n",
      "│  9 │  -1.11 │ story sweet            │    2.59 │ masterpiece   │\n",
      "╘════╧════════╧════════════════════════╧═════════╧═══════════════╛\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>classifier</th>\n",
       "      <th>vectorizer</th>\n",
       "      <th>score</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.579779</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>mnb</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.593444</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.628302</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.629617</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>svm</td>\n",
       "      <td>V1</td>\n",
       "      <td>0.630428</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  classifier vectorizer     score\n",
       "0        mnb         V1  0.579779\n",
       "1        mnb         V1  0.593444\n",
       "2        svm         V1  0.628302\n",
       "3        svm         V1  0.629617\n",
       "4        svm         V1  0.630428"
      ]
     },
     "execution_count": 151,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "vec = bigram_tv_v5\n",
    "classifier = svm\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "\n",
    "df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 152,
   "metadata": {},
   "outputs": [],
   "source": [
    "# \tclassifier\tvectorizer\tscore\n",
    "# 0\tmnb\tV1\t0.583317\n",
    "# 1\tmnb\tV1\t0.594499\n",
    "# 2\tsvm\tV1\t0.629566\n",
    "# 3\tsvm\tV1\t0.636662"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 1 -- MNB & SVM with Vectorizer 1\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# vec = unigram_bool_cv_v1\n",
    "# classifier = mnb\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V1', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# vec = unigram_bool_cv_v1\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V1', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "NOTES: Very interesting!! MNB is very cluttered with numbers. SVM is not. "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 2 -- MNB & SVM with Vectorizer 2\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = unigram_bool_cv_v2\n",
    "# classifier = mnb\n",
    "\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V2', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# vec = unigram_bool_cv_v2\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V2', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 3 -- MNB & SVM with Vectorizer 3\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = unigram_cv\n",
    "# classifier = mnb\n",
    "\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V3', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 38,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = unigram_cv\n",
    "# classifier = svm\n",
    "\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V3', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 4 -- MNB & SVM with Vectorizer 4\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 39,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = bigram_cv\n",
    "# classifier = mnb\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V4', 'score': score})\n",
    "\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V4', 'score': score})\n",
    "# df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 40,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 5 -- MNB & SVM with Vectorizer 5\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============ Sentiment Score:  0\n",
      "╒════╤════════╤═══════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely            │   Least │ Likely     │\n",
      "╞════╪════════╪═══════════════════╪═════════╪════════════╡\n",
      "│  0 │ -11.16 │ aaliyah           │   -6.62 │ time       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  1 │ -11.16 │ abagnale          │   -6.61 │ characters │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  2 │ -11.16 │ abagnale antics   │   -6.60 │ minutes    │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  3 │ -11.16 │ abandon political │   -6.60 │ story      │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  4 │ -11.16 │ abandoned         │   -6.59 │ comedy     │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  5 │ -11.16 │ abbreviated       │   -6.37 │ just       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  6 │ -11.16 │ abel              │   -5.82 │ like       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  7 │ -11.16 │ abel ferrara      │   -5.65 │ bad        │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  8 │ -11.16 │ abhors            │   -5.51 │ film       │\n",
      "├────┼────────┼───────────────────┼─────────┼────────────┤\n",
      "│  9 │ -11.16 │ abiding           │   -5.00 │ movie      │\n",
      "╘════╧════════╧═══════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  1\n",
      "╒════╤════════╤═══════════════════╤═════════╤══════════╕\n",
      "│    │   Most │ Likely            │   Least │ Likely   │\n",
      "╞════╪════════╪═══════════════════╪═════════╪══════════╡\n",
      "│  0 │ -11.87 │ abagnale          │   -6.27 │ lrb      │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  1 │ -11.87 │ abagnale antics   │   -6.23 │ bad      │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  2 │ -11.87 │ abandon political │   -6.18 │ rrb      │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  3 │ -11.87 │ abbott            │   -6.17 │ little   │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  4 │ -11.87 │ abbott ernest     │   -6.02 │ story    │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  5 │ -11.87 │ abdul             │   -5.97 │ just     │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  6 │ -11.87 │ abdul malik       │   -5.96 │ does     │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  7 │ -11.87 │ abel              │   -5.57 │ like     │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  8 │ -11.87 │ abel ferrara      │   -5.22 │ film     │\n",
      "├────┼────────┼───────────────────┼─────────┼──────────┤\n",
      "│  9 │ -11.87 │ abilities         │   -5.09 │ movie    │\n",
      "╘════╧════════╧═══════════════════╧═════════╧══════════╛\n",
      "============ Sentiment Score:  2\n",
      "╒════╤════════╤════════════════════════╤═════════╤════════════╕\n",
      "│    │   Most │ Likely                 │   Least │ Likely     │\n",
      "╞════╪════════╪════════════════════════╪═════════╪════════════╡\n",
      "│  0 │ -12.29 │ abandon theater        │   -6.39 │ movies     │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  1 │ -12.29 │ ability shock          │   -6.36 │ characters │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  2 │ -12.29 │ ability think          │   -6.24 │ life       │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  3 │ -12.29 │ able performances      │   -6.24 │ time       │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  4 │ -12.29 │ able project           │   -6.02 │ lrb        │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  5 │ -12.29 │ abroad                 │   -5.94 │ story      │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  6 │ -12.29 │ absolutely earned      │   -5.77 │ rrb        │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  7 │ -12.29 │ absolutely inescapably │   -5.74 │ like       │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  8 │ -12.29 │ absorbing characters   │   -5.19 │ movie      │\n",
      "├────┼────────┼────────────────────────┼─────────┼────────────┤\n",
      "│  9 │ -12.29 │ absorbing look         │   -5.14 │ film       │\n",
      "╘════╧════════╧════════════════════════╧═════════╧════════════╛\n",
      "============ Sentiment Score:  3\n",
      "╒════╤════════╤═════════════════╤═════════╤══════════╕\n",
      "│    │   Most │ Likely          │   Least │ Likely   │\n",
      "╞════╪════════╪═════════════════╪═════════╪══════════╡\n",
      "│  0 │ -11.99 │ aaliyah         │   -6.26 │ love     │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  1 │ -11.99 │ abandon theater │   -6.26 │ lrb      │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  2 │ -11.99 │ abbreviated     │   -6.18 │ life     │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  3 │ -11.99 │ abc             │   -6.18 │ rrb      │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  4 │ -11.99 │ abhorrent       │   -6.07 │ like     │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  5 │ -11.99 │ abhors          │   -6.01 │ funny    │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  6 │ -11.99 │ ability shock   │   -6.01 │ story    │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  7 │ -11.99 │ able accomplish │   -5.61 │ good     │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  8 │ -11.99 │ able better     │   -5.29 │ movie    │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────┤\n",
      "│  9 │ -11.99 │ able project    │   -4.99 │ film     │\n",
      "╘════╧════════╧═════════════════╧═════════╧══════════╛\n",
      "============ Sentiment Score:  4\n",
      "╒════╤════════╤═════════════════╤═════════╤══════════════╕\n",
      "│    │   Most │ Likely          │   Least │ Likely       │\n",
      "╞════╪════════╪═════════════════╪═════════╪══════════════╡\n",
      "│  0 │ -11.27 │ aaliyah         │   -6.46 │ performance  │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  1 │ -11.27 │ abagnale        │   -6.41 │ comedy       │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  2 │ -11.27 │ abagnale antics │   -6.36 │ great        │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  3 │ -11.27 │ abandon scripts │   -6.34 │ story        │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  4 │ -11.27 │ abandon theater │   -6.29 │ performances │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  5 │ -11.27 │ abandoned       │   -6.01 │ good         │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  6 │ -11.27 │ abbott          │   -5.87 │ funny        │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  7 │ -11.27 │ abbott ernest   │   -5.78 │ best         │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  8 │ -11.27 │ abbreviated     │   -5.40 │ movie        │\n",
      "├────┼────────┼─────────────────┼─────────┼──────────────┤\n",
      "│  9 │ -11.27 │ abc             │   -4.89 │ film         │\n",
      "╘════╧════════╧═════════════════╧═════════╧══════════════╛\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'big_df' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-43-64413d29c6fd>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mscore\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mreport\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_model\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mX\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0my\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m4\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'0'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'1'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'2'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'3'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'4'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mclassifier\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvec\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0mreturn_features\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvec\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mupdate_big_df\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mbig_df\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m{\u001b[0m \u001b[0;34m'classifier'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'mnb'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'vectorizer'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0;34m'V5'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'score'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mscore\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      8\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m 
\u001b[0mclassifier\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msvm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mNameError\u001b[0m: name 'big_df' is not defined"
     ]
    }
   ],
   "source": [
    "vec = bigram_cv_v2\n",
    "classifier = mnb\n",
    "\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V5', 'score': score})\n",
    "\n",
    "classifier = svm\n",
    "\n",
    "model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V5', 'score': score})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 6 -- MNB & SVM with Vectorizer 6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = unigram_tv\n",
    "# classifier = mnb\n",
    "\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V6', 'score': score})\n",
    "\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V6', 'score': score})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 7 -- MNB & SVM with Vectorizer 7"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = unigram_tv_v2\n",
    "# classifier = mnb\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V7', 'score': score})\n",
    "\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V7', 'score': score})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 8 -- MNB & SVM with Vectorizer 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# vec = bigram_tv\n",
    "# classifier = mnb\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V8', 'score': score})\n",
    "\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V8', 'score': score})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 9 -- MNB & SVM with Vectorizer 9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# vec = bigram_tv_v2\n",
    "# classifier = mnb\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'mnb', 'vectorizer': 'V9', 'score': score})\n",
    "\n",
    "# classifier = svm\n",
    "\n",
    "# model, score, report = get_model(X,y,[0,1,2,3,4],['0','1','2','3','4'], classifier, vec)\n",
    "# return_features(vec, model)\n",
    "# df = update_big_df(big_df,{ 'classifier': 'svm', 'vectorizer': 'V9', 'score': score})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "156060 (117045, 78021) (66292, 78021)\n",
      "prediction complete\n"
     ]
    }
   ],
   "source": [
    "train=pd.read_csv(\"kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values\n",
    "\n",
    "# pred_vec = bigram_tv_v3 # 60.4\n",
    "# pred_vec = bigram_tv_v4 # 60.569\n",
    "pred_vec = bigram_tv_v4 # removing words < 2 60.584\n",
    "\n",
    "\n",
    "test = pd.read_csv(\"kaggle-sentiment/test.tsv\", delimiter='\\t')\n",
    "k_id = test['PhraseId']\n",
    "k_text = test['Phrase']\n",
    "\n",
    "# k_vec = bigram_tv_v3.transform(k_text)\n",
    "# k_vec\n",
    "\n",
    "def get_kaggle_test_train_vec(X,y,vectorizer):\n",
    "    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=None, random_state=0)\n",
    "    X_train_vec = vectorizer.fit_transform(X_train)\n",
    "#     X_test_vec = vectorizer.transform(X_test)\n",
    "    return X_train_vec, y_train,\n",
    "\n",
    "def do_the_kaggle(X,y,vec):\n",
    "    X_train_vec, y_train = get_kaggle_test_train_vec(X,y,vec)\n",
    "    svm_clf = LinearSVC(C=1)\n",
    "    k_vec = pred_vec.transform(k_text)\n",
    "    print(len(X), X_train_vec.shape, k_vec.shape)\n",
    "\n",
    "    prediction = svm_clf.fit(X_train_vec,y_train).predict(k_vec)\n",
    "    kaggle_submission = zip(k_id, prediction)\n",
    "    outf=open('kaggle_submission_linearSVC_v10.csv', 'w')\n",
    "    outf.write('PhraseId,Sentiment\\n')\n",
    "    for x, value in enumerate(kaggle_submission): outf.write(str(value[0]) + ',' + str(value[1]) + '\\n')\n",
    "    outf.close()\n",
    "    print('prediction complete')\n",
    "\n",
    "do_the_kaggle(X,y,pred_vec)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
