{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from nltk.tokenize.casual import casual_tokenize\n",
    "tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)\n",
    "from sklearn.preprocessing import MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "15606"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "og_df = pd.read_csv('kaggle_csv.csv')\n",
    "df_sm = og_df.copy()\n",
    "df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)\n",
    "df = df_sm.copy()\n",
    "len(df_sm)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dividing_vec(df, num):\n",
    "    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df['PoN'].astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    return negativity_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "negativeness_score_01 = get_dividing_vec(df, 2) # splits into 01, 234\n",
    "negativeness_score_34 = get_dividing_vec(df, 3) # splits into 012, 34"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def get_lda_score(negativeness_score):\n",
    "#     return MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))\n",
    "# df['lda_score'] = df.apply(lambda x: get_lda_score(x['Phrase']), axis=1)\n",
    "# df['lda_score'] = df.apply(lambda x: get_lda_score(negativeness_score), axis=1)\n",
    "# df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))\n",
    "df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1,1))\n",
    "df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score_01</th>\n",
       "      <th>lda_predict_01</th>\n",
       "      <th>lda_score_34</th>\n",
       "      <th>lda_predict_34</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>128037</td>\n",
       "      <td>128038</td>\n",
       "      <td>6887</td>\n",
       "      <td>as the main character suggests , ` what if</td>\n",
       "      <td>3</td>\n",
       "      <td>0.625832</td>\n",
       "      <td>True</td>\n",
       "      <td>0.261945</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>5192</td>\n",
       "      <td>5193</td>\n",
       "      <td>206</td>\n",
       "      <td>well-wrought story</td>\n",
       "      <td>4</td>\n",
       "      <td>0.870760</td>\n",
       "      <td>True</td>\n",
       "      <td>0.209028</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50057</td>\n",
       "      <td>50057</td>\n",
       "      <td>50058</td>\n",
       "      <td>2457</td>\n",
       "      <td>pack raw dough</td>\n",
       "      <td>2</td>\n",
       "      <td>0.858039</td>\n",
       "      <td>True</td>\n",
       "      <td>0.173733</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109259</td>\n",
       "      <td>109259</td>\n",
       "      <td>109260</td>\n",
       "      <td>5785</td>\n",
       "      <td>into the editing room</td>\n",
       "      <td>2</td>\n",
       "      <td>0.769221</td>\n",
       "      <td>True</td>\n",
       "      <td>0.192198</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73349</td>\n",
       "      <td>73349</td>\n",
       "      <td>73350</td>\n",
       "      <td>3748</td>\n",
       "      <td>concerned with morality</td>\n",
       "      <td>2</td>\n",
       "      <td>0.863717</td>\n",
       "      <td>True</td>\n",
       "      <td>0.231941</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25979</td>\n",
       "      <td>25979</td>\n",
       "      <td>25980</td>\n",
       "      <td>1189</td>\n",
       "      <td>Spy</td>\n",
       "      <td>2</td>\n",
       "      <td>0.882761</td>\n",
       "      <td>True</td>\n",
       "      <td>0.166748</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28724</td>\n",
       "      <td>28724</td>\n",
       "      <td>28725</td>\n",
       "      <td>1331</td>\n",
       "      <td>semi-autobiographical film</td>\n",
       "      <td>2</td>\n",
       "      <td>0.873131</td>\n",
       "      <td>True</td>\n",
       "      <td>0.233286</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>5064</td>\n",
       "      <td>5065</td>\n",
       "      <td>198</td>\n",
       "      <td>that writer and director Burr Steers knows the...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.803085</td>\n",
       "      <td>True</td>\n",
       "      <td>0.332284</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85856</td>\n",
       "      <td>85856</td>\n",
       "      <td>85857</td>\n",
       "      <td>4443</td>\n",
       "      <td>associations you choose to make</td>\n",
       "      <td>2</td>\n",
       "      <td>0.738841</td>\n",
       "      <td>True</td>\n",
       "      <td>0.194295</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141693</td>\n",
       "      <td>141693</td>\n",
       "      <td>141694</td>\n",
       "      <td>7686</td>\n",
       "      <td>a human volcano or</td>\n",
       "      <td>2</td>\n",
       "      <td>0.762257</td>\n",
       "      <td>True</td>\n",
       "      <td>0.265051</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15606 rows × 9 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  PhraseId  SentenceId  \\\n",
       "128037      128037    128038        6887   \n",
       "5192          5192      5193         206   \n",
       "50057        50057     50058        2457   \n",
       "109259      109259    109260        5785   \n",
       "73349        73349     73350        3748   \n",
       "...            ...       ...         ...   \n",
       "25979        25979     25980        1189   \n",
       "28724        28724     28725        1331   \n",
       "5064          5064      5065         198   \n",
       "85856        85856     85857        4443   \n",
       "141693      141693    141694        7686   \n",
       "\n",
       "                                                   Phrase  Sentiment  \\\n",
       "128037         as the main character suggests , ` what if          3   \n",
       "5192                                   well-wrought story          4   \n",
       "50057                                      pack raw dough          2   \n",
       "109259                              into the editing room          2   \n",
       "73349                             concerned with morality          2   \n",
       "...                                                   ...        ...   \n",
       "25979                                                 Spy          2   \n",
       "28724                          semi-autobiographical film          2   \n",
       "5064    that writer and director Burr Steers knows the...          3   \n",
       "85856                     associations you choose to make          2   \n",
       "141693                                 a human volcano or          2   \n",
       "\n",
       "        lda_score_01  lda_predict_01  lda_score_34  lda_predict_34  \n",
       "128037      0.625832            True      0.261945           False  \n",
       "5192        0.870760            True      0.209028           False  \n",
       "50057       0.858039            True      0.173733           False  \n",
       "109259      0.769221            True      0.192198           False  \n",
       "73349       0.863717            True      0.231941           False  \n",
       "...              ...             ...           ...             ...  \n",
       "25979       0.882761            True      0.166748           False  \n",
       "28724       0.873131            True      0.233286           False  \n",
       "5064        0.803085            True      0.332284           False  \n",
       "85856       0.738841            True      0.194295           False  \n",
       "141693      0.762257            True      0.265051           False  \n",
       "\n",
       "[15606 rows x 9 columns]"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "def calculate_actual(row):\n",
    "    if (row['lda_predict_01'] == False) and (row['lda_predict_34'] == False):\n",
    "        return 0\n",
    "    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == False):\n",
    "        return 1\n",
    "    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == False):\n",
    "        return 3\n",
    "df['actual'] = df.apply"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_dividing_vec_og(df, num):\n",
    "    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df['PoN'].astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    return negativity_score\n",
    "\n",
    "def get_dividing_vec(df, num):\n",
    "    df['PoN'] = [0 if x == num else 1 for x in df['Sentiment']]\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df['PoN'].astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    return negativity_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "metadata": {},
   "outputs": [],
   "source": [
    "negativeness_score_0 = get_dividing_vec(df, 0) # splits into 0, 1234\n",
    "negativeness_score_1 = get_dividing_vec(df, 1) # splits into 1, 0234\n",
    "negativeness_score_2 = get_dividing_vec(df, 2) # splits into 2, 0134\n",
    "negativeness_score_3 = get_dividing_vec(df, 3) # splits into 3, 0124\n",
    "negativeness_score_4 = get_dividing_vec(df, 4) # splits into 4, 0123"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['lda_score_0'] = MinMaxScaler().fit_transform(negativeness_score_0.reshape(-1,1))\n",
    "df['lda_predict_0'] = df.apply(lambda x: (x['lda_score_0'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_1'] = MinMaxScaler().fit_transform(negativeness_score_1.reshape(-1,1))\n",
    "df['lda_predict_1'] = df.apply(lambda x: (x['lda_score_1'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_2'] = MinMaxScaler().fit_transform(negativeness_score_2.reshape(-1,1))\n",
    "df['lda_predict_2'] = df.apply(lambda x: (x['lda_score_2'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_3'] = MinMaxScaler().fit_transform(negativeness_score_3.reshape(-1,1))\n",
    "df['lda_predict_3'] = df.apply(lambda x: (x['lda_score_3'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_4'] = MinMaxScaler().fit_transform(negativeness_score_4.reshape(-1,1))\n",
    "df['lda_predict_4'] = df.apply(lambda x: (x['lda_score_4'] >.5), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "negativeness_score_01 = get_dividing_vec_og(df, 2) # splits into 4, 0123\n",
    "negativeness_score_34 = get_dividing_vec_og(df, 3) # splits into 4, 0123\n",
    "df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))\n",
    "df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)\n",
    "\n",
    "df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))\n",
    "df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  import sys\n",
      "/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "from collections import Counter\n",
    "neg_df = df[df['lda_predict_01'] == False]\n",
    "# print(len(neg_df))\n",
    "# Counter(neg_df['Sentiment'])\n",
    "neg_df\n",
    "negativeness_score_01_0 = get_dividing_vec_og(neg_df.copy(), 1) # splits into 4, 0123\n",
    "neg_df['lda_score_01_0'] = MinMaxScaler().fit_transform(negativeness_score_01_0.reshape(-1,1))\n",
    "neg_df['lda_predict_01_0'] = neg_df.apply(lambda x: (x['lda_score_01_0'] >.5), axis=1)\n",
    "# neg_df\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})"
      ]
     },
     "execution_count": 70,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "columns = ['Sentiment','lda_predict_01_0']\n",
    "view_df = pd.DataFrame(neg_df, columns = columns)\n",
    "view_df\n",
    "Counter(neg_df['lda_predict_01_0'])\n",
    "Counter(neg_df['Sentiment'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = ['Sentiment','lda_predict_01','lda_predict_34','lda_predict_0', 'lda_predict_1','lda_predict_2','lda_predict_3','lda_predict_4']\n",
    "view_df = pd.DataFrame(df, columns = columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_predict_01</th>\n",
       "      <th>lda_predict_34</th>\n",
       "      <th>lda_predict_0</th>\n",
       "      <th>lda_predict_1</th>\n",
       "      <th>lda_predict_2</th>\n",
       "      <th>lda_predict_3</th>\n",
       "      <th>lda_predict_4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>99335</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>144855</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>82075</td>\n",
       "      <td>4</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>82568</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>86351</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28644</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28410</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20683</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>146190</td>\n",
       "      <td>4</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>985 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Sentiment  lda_predict_01  lda_predict_34  lda_predict_0  \\\n",
       "5192            4            True            True           True   \n",
       "99335           4            True            True           True   \n",
       "144855          4            True            True           True   \n",
       "82075           4           False           False          False   \n",
       "82568           4            True            True           True   \n",
       "...           ...             ...             ...            ...   \n",
       "86351           4            True            True           True   \n",
       "28644           4            True            True           True   \n",
       "28410           4            True            True           True   \n",
       "20683           4            True            True           True   \n",
       "146190          4            True            True           True   \n",
       "\n",
       "        lda_predict_1  lda_predict_2  lda_predict_3  lda_predict_4  \n",
       "5192             True          False           True           True  \n",
       "99335            True          False           True           True  \n",
       "144855           True          False           True           True  \n",
       "82075           False          False           True           True  \n",
       "82568            True          False           True           True  \n",
       "...               ...            ...            ...            ...  \n",
       "86351            True          False          False           True  \n",
       "28644            True          False           True           True  \n",
       "28410            True          False           True           True  \n",
       "20683            True          False           True           True  \n",
       "146190           True          False           True           True  \n",
       "\n",
       "[985 rows x 8 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "view_df[view_df['Sentiment'] == 4]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_predict_01</th>\n",
       "      <th>lda_predict_34</th>\n",
       "      <th>lda_predict_0</th>\n",
       "      <th>lda_predict_1</th>\n",
       "      <th>lda_predict_2</th>\n",
       "      <th>lda_predict_3</th>\n",
       "      <th>lda_predict_4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21758</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>51668</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19946</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>103068</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>115347</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>139003</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>116843</td>\n",
       "      <td>3</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>87571</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>3</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "      <td>False</td>\n",
       "      <td>True</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>3239 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Sentiment  lda_predict_01  lda_predict_34  lda_predict_0  \\\n",
       "128037          3            True            True           True   \n",
       "21758           3            True            True           True   \n",
       "51668           3            True            True           True   \n",
       "19946           3            True            True           True   \n",
       "103068          3            True            True           True   \n",
       "...           ...             ...             ...            ...   \n",
       "115347          3            True            True           True   \n",
       "139003          3            True            True           True   \n",
       "116843          3           False           False          False   \n",
       "87571           3            True            True           True   \n",
       "5064            3            True            True           True   \n",
       "\n",
       "        lda_predict_1  lda_predict_2  lda_predict_3  lda_predict_4  \n",
       "128037           True          False           True           True  \n",
       "21758            True          False           True           True  \n",
       "51668            True          False           True           True  \n",
       "19946            True          False           True           True  \n",
       "103068           True          False           True           True  \n",
       "...               ...            ...            ...            ...  \n",
       "115347           True          False          False           True  \n",
       "139003           True          False           True           True  \n",
       "116843           True           True          False          False  \n",
       "87571            True          False           True           True  \n",
       "5064             True          False           True           True  \n",
       "\n",
       "[3239 rows x 8 columns]"
      ]
     },
     "execution_count": 52,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "view_df[view_df['Sentiment'] == 3]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score_01</th>\n",
       "      <th>lda_predict_01</th>\n",
       "      <th>lda_score_34</th>\n",
       "      <th>lda_predict_34</th>\n",
       "      <th>PoN</th>\n",
       "      <th>lda_score_0</th>\n",
       "      <th>lda_predict_0</th>\n",
       "      <th>lda_score_1</th>\n",
       "      <th>lda_predict_1</th>\n",
       "      <th>lda_score_2</th>\n",
       "      <th>lda_predict_2</th>\n",
       "      <th>lda_score_3</th>\n",
       "      <th>lda_predict_3</th>\n",
       "      <th>lda_score_4</th>\n",
       "      <th>lda_predict_4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>128037</td>\n",
       "      <td>128038</td>\n",
       "      <td>6887</td>\n",
       "      <td>as the main character suggests , ` what if</td>\n",
       "      <td>3</td>\n",
       "      <td>0.625832</td>\n",
       "      <td>True</td>\n",
       "      <td>0.625832</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>0.682450</td>\n",
       "      <td>True</td>\n",
       "      <td>0.588361</td>\n",
       "      <td>True</td>\n",
       "      <td>0.230660</td>\n",
       "      <td>False</td>\n",
       "      <td>0.658924</td>\n",
       "      <td>True</td>\n",
       "      <td>0.825389</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>5192</td>\n",
       "      <td>5193</td>\n",
       "      <td>206</td>\n",
       "      <td>well-wrought story</td>\n",
       "      <td>4</td>\n",
       "      <td>0.870760</td>\n",
       "      <td>True</td>\n",
       "      <td>0.870760</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>0.950018</td>\n",
       "      <td>True</td>\n",
       "      <td>0.807329</td>\n",
       "      <td>True</td>\n",
       "      <td>0.074725</td>\n",
       "      <td>False</td>\n",
       "      <td>0.743194</td>\n",
       "      <td>True</td>\n",
       "      <td>0.833822</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50057</td>\n",
       "      <td>50057</td>\n",
       "      <td>50058</td>\n",
       "      <td>2457</td>\n",
       "      <td>pack raw dough</td>\n",
       "      <td>2</td>\n",
       "      <td>0.858039</td>\n",
       "      <td>True</td>\n",
       "      <td>0.858039</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.859539</td>\n",
       "      <td>True</td>\n",
       "      <td>0.848161</td>\n",
       "      <td>True</td>\n",
       "      <td>0.052835</td>\n",
       "      <td>False</td>\n",
       "      <td>0.760960</td>\n",
       "      <td>True</td>\n",
       "      <td>0.892131</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109259</td>\n",
       "      <td>109259</td>\n",
       "      <td>109260</td>\n",
       "      <td>5785</td>\n",
       "      <td>into the editing room</td>\n",
       "      <td>2</td>\n",
       "      <td>0.769221</td>\n",
       "      <td>True</td>\n",
       "      <td>0.769221</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.865337</td>\n",
       "      <td>True</td>\n",
       "      <td>0.698661</td>\n",
       "      <td>True</td>\n",
       "      <td>0.108810</td>\n",
       "      <td>False</td>\n",
       "      <td>0.738192</td>\n",
       "      <td>True</td>\n",
       "      <td>0.880093</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73349</td>\n",
       "      <td>73349</td>\n",
       "      <td>73350</td>\n",
       "      <td>3748</td>\n",
       "      <td>concerned with morality</td>\n",
       "      <td>2</td>\n",
       "      <td>0.863717</td>\n",
       "      <td>True</td>\n",
       "      <td>0.863717</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.925024</td>\n",
       "      <td>True</td>\n",
       "      <td>0.812825</td>\n",
       "      <td>True</td>\n",
       "      <td>0.096071</td>\n",
       "      <td>False</td>\n",
       "      <td>0.697099</td>\n",
       "      <td>True</td>\n",
       "      <td>0.843338</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25979</td>\n",
       "      <td>25979</td>\n",
       "      <td>25980</td>\n",
       "      <td>1189</td>\n",
       "      <td>Spy</td>\n",
       "      <td>2</td>\n",
       "      <td>0.882761</td>\n",
       "      <td>True</td>\n",
       "      <td>0.882761</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.925578</td>\n",
       "      <td>True</td>\n",
       "      <td>0.843655</td>\n",
       "      <td>True</td>\n",
       "      <td>0.035800</td>\n",
       "      <td>False</td>\n",
       "      <td>0.783875</td>\n",
       "      <td>True</td>\n",
       "      <td>0.877085</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28724</td>\n",
       "      <td>28724</td>\n",
       "      <td>28725</td>\n",
       "      <td>1331</td>\n",
       "      <td>semi-autobiographical film</td>\n",
       "      <td>2</td>\n",
       "      <td>0.873131</td>\n",
       "      <td>True</td>\n",
       "      <td>0.873131</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.898261</td>\n",
       "      <td>True</td>\n",
       "      <td>0.846496</td>\n",
       "      <td>True</td>\n",
       "      <td>0.092741</td>\n",
       "      <td>False</td>\n",
       "      <td>0.725815</td>\n",
       "      <td>True</td>\n",
       "      <td>0.800833</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>5064</td>\n",
       "      <td>5065</td>\n",
       "      <td>198</td>\n",
       "      <td>that writer and director Burr Steers knows the...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.803085</td>\n",
       "      <td>True</td>\n",
       "      <td>0.803085</td>\n",
       "      <td>True</td>\n",
       "      <td>1</td>\n",
       "      <td>0.774411</td>\n",
       "      <td>True</td>\n",
       "      <td>0.816138</td>\n",
       "      <td>True</td>\n",
       "      <td>0.203446</td>\n",
       "      <td>False</td>\n",
       "      <td>0.607434</td>\n",
       "      <td>True</td>\n",
       "      <td>0.731231</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85856</td>\n",
       "      <td>85856</td>\n",
       "      <td>85857</td>\n",
       "      <td>4443</td>\n",
       "      <td>associations you choose to make</td>\n",
       "      <td>2</td>\n",
       "      <td>0.738841</td>\n",
       "      <td>True</td>\n",
       "      <td>0.738841</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.797750</td>\n",
       "      <td>True</td>\n",
       "      <td>0.694952</td>\n",
       "      <td>True</td>\n",
       "      <td>0.124631</td>\n",
       "      <td>False</td>\n",
       "      <td>0.731624</td>\n",
       "      <td>True</td>\n",
       "      <td>0.884184</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141693</td>\n",
       "      <td>141693</td>\n",
       "      <td>141694</td>\n",
       "      <td>7686</td>\n",
       "      <td>a human volcano or</td>\n",
       "      <td>2</td>\n",
       "      <td>0.762257</td>\n",
       "      <td>True</td>\n",
       "      <td>0.762257</td>\n",
       "      <td>True</td>\n",
       "      <td>0</td>\n",
       "      <td>0.790570</td>\n",
       "      <td>True</td>\n",
       "      <td>0.738217</td>\n",
       "      <td>True</td>\n",
       "      <td>0.169487</td>\n",
       "      <td>False</td>\n",
       "      <td>0.672788</td>\n",
       "      <td>True</td>\n",
       "      <td>0.799116</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15606 rows × 20 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  PhraseId  SentenceId  \\\n",
       "128037      128037    128038        6887   \n",
       "5192          5192      5193         206   \n",
       "50057        50057     50058        2457   \n",
       "109259      109259    109260        5785   \n",
       "73349        73349     73350        3748   \n",
       "...            ...       ...         ...   \n",
       "25979        25979     25980        1189   \n",
       "28724        28724     28725        1331   \n",
       "5064          5064      5065         198   \n",
       "85856        85856     85857        4443   \n",
       "141693      141693    141694        7686   \n",
       "\n",
       "                                                   Phrase  Sentiment  \\\n",
       "128037         as the main character suggests , ` what if          3   \n",
       "5192                                   well-wrought story          4   \n",
       "50057                                      pack raw dough          2   \n",
       "109259                              into the editing room          2   \n",
       "73349                             concerned with morality          2   \n",
       "...                                                   ...        ...   \n",
       "25979                                                 Spy          2   \n",
       "28724                          semi-autobiographical film          2   \n",
       "5064    that writer and director Burr Steers knows the...          3   \n",
       "85856                     associations you choose to make          2   \n",
       "141693                                 a human volcano or          2   \n",
       "\n",
       "        lda_score_01  lda_predict_01  lda_score_34  lda_predict_34  PoN  \\\n",
       "128037      0.625832            True      0.625832            True    1   \n",
       "5192        0.870760            True      0.870760            True    1   \n",
       "50057       0.858039            True      0.858039            True    0   \n",
       "109259      0.769221            True      0.769221            True    0   \n",
       "73349       0.863717            True      0.863717            True    0   \n",
       "...              ...             ...           ...             ...  ...   \n",
       "25979       0.882761            True      0.882761            True    0   \n",
       "28724       0.873131            True      0.873131            True    0   \n",
       "5064        0.803085            True      0.803085            True    1   \n",
       "85856       0.738841            True      0.738841            True    0   \n",
       "141693      0.762257            True      0.762257            True    0   \n",
       "\n",
       "        lda_score_0  lda_predict_0  lda_score_1  lda_predict_1  lda_score_2  \\\n",
       "128037     0.682450           True     0.588361           True     0.230660   \n",
       "5192       0.950018           True     0.807329           True     0.074725   \n",
       "50057      0.859539           True     0.848161           True     0.052835   \n",
       "109259     0.865337           True     0.698661           True     0.108810   \n",
       "73349      0.925024           True     0.812825           True     0.096071   \n",
       "...             ...            ...          ...            ...          ...   \n",
       "25979      0.925578           True     0.843655           True     0.035800   \n",
       "28724      0.898261           True     0.846496           True     0.092741   \n",
       "5064       0.774411           True     0.816138           True     0.203446   \n",
       "85856      0.797750           True     0.694952           True     0.124631   \n",
       "141693     0.790570           True     0.738217           True     0.169487   \n",
       "\n",
       "        lda_predict_2  lda_score_3  lda_predict_3  lda_score_4  lda_predict_4  \n",
       "128037          False     0.658924           True     0.825389           True  \n",
       "5192            False     0.743194           True     0.833822           True  \n",
       "50057           False     0.760960           True     0.892131           True  \n",
       "109259          False     0.738192           True     0.880093           True  \n",
       "73349           False     0.697099           True     0.843338           True  \n",
       "...               ...          ...            ...          ...            ...  \n",
       "25979           False     0.783875           True     0.877085           True  \n",
       "28724           False     0.725815           True     0.800833           True  \n",
       "5064            False     0.607434           True     0.731231           True  \n",
       "85856           False     0.731624           True     0.884184           True  \n",
       "141693          False     0.672788           True     0.799116           True  \n",
       "\n",
       "[15606 rows x 20 columns]"
      ]
     },
     "execution_count": 71,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "columns = ['Sentiment','lda_score_0', 'lda_score_1','lda_score_2','lda_score_3','lda_score_4']\n",
    "view_df = pd.DataFrame(df, columns = columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score_0</th>\n",
       "      <th>lda_score_1</th>\n",
       "      <th>lda_score_2</th>\n",
       "      <th>lda_score_3</th>\n",
       "      <th>lda_score_4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>3</td>\n",
       "      <td>0.682450</td>\n",
       "      <td>0.588361</td>\n",
       "      <td>0.230660</td>\n",
       "      <td>0.658924</td>\n",
       "      <td>0.825389</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>4</td>\n",
       "      <td>0.950018</td>\n",
       "      <td>0.807329</td>\n",
       "      <td>0.074725</td>\n",
       "      <td>0.743194</td>\n",
       "      <td>0.833822</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50057</td>\n",
       "      <td>2</td>\n",
       "      <td>0.859539</td>\n",
       "      <td>0.848161</td>\n",
       "      <td>0.052835</td>\n",
       "      <td>0.760960</td>\n",
       "      <td>0.892131</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109259</td>\n",
       "      <td>2</td>\n",
       "      <td>0.865337</td>\n",
       "      <td>0.698661</td>\n",
       "      <td>0.108810</td>\n",
       "      <td>0.738192</td>\n",
       "      <td>0.880093</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73349</td>\n",
       "      <td>2</td>\n",
       "      <td>0.925024</td>\n",
       "      <td>0.812825</td>\n",
       "      <td>0.096071</td>\n",
       "      <td>0.697099</td>\n",
       "      <td>0.843338</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25979</td>\n",
       "      <td>2</td>\n",
       "      <td>0.925578</td>\n",
       "      <td>0.843655</td>\n",
       "      <td>0.035800</td>\n",
       "      <td>0.783875</td>\n",
       "      <td>0.877085</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28724</td>\n",
       "      <td>2</td>\n",
       "      <td>0.898261</td>\n",
       "      <td>0.846496</td>\n",
       "      <td>0.092741</td>\n",
       "      <td>0.725815</td>\n",
       "      <td>0.800833</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>3</td>\n",
       "      <td>0.774411</td>\n",
       "      <td>0.816138</td>\n",
       "      <td>0.203446</td>\n",
       "      <td>0.607434</td>\n",
       "      <td>0.731231</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85856</td>\n",
       "      <td>2</td>\n",
       "      <td>0.797750</td>\n",
       "      <td>0.694952</td>\n",
       "      <td>0.124631</td>\n",
       "      <td>0.731624</td>\n",
       "      <td>0.884184</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141693</td>\n",
       "      <td>2</td>\n",
       "      <td>0.790570</td>\n",
       "      <td>0.738217</td>\n",
       "      <td>0.169487</td>\n",
       "      <td>0.672788</td>\n",
       "      <td>0.799116</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15606 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Sentiment  lda_score_0  lda_score_1  lda_score_2  lda_score_3  \\\n",
       "128037          3     0.682450     0.588361     0.230660     0.658924   \n",
       "5192            4     0.950018     0.807329     0.074725     0.743194   \n",
       "50057           2     0.859539     0.848161     0.052835     0.760960   \n",
       "109259          2     0.865337     0.698661     0.108810     0.738192   \n",
       "73349           2     0.925024     0.812825     0.096071     0.697099   \n",
       "...           ...          ...          ...          ...          ...   \n",
       "25979           2     0.925578     0.843655     0.035800     0.783875   \n",
       "28724           2     0.898261     0.846496     0.092741     0.725815   \n",
       "5064            3     0.774411     0.816138     0.203446     0.607434   \n",
       "85856           2     0.797750     0.694952     0.124631     0.731624   \n",
       "141693          2     0.790570     0.738217     0.169487     0.672788   \n",
       "\n",
       "        lda_score_4  \n",
       "128037     0.825389  \n",
       "5192       0.833822  \n",
       "50057      0.892131  \n",
       "109259     0.880093  \n",
       "73349      0.843338  \n",
       "...             ...  \n",
       "25979      0.877085  \n",
       "28724      0.800833  \n",
       "5064       0.731231  \n",
       "85856      0.884184  \n",
       "141693     0.799116  \n",
       "\n",
       "[15606 rows x 6 columns]"
      ]
     },
     "execution_count": 75,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "view_df['avg_0'] = "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.769356026955543\n",
      "0.701779683155207\n",
      "0.18353138059679489\n",
      "0.6827725447936251\n",
      "0.786741026622421\n"
     ]
    }
   ],
   "source": [
    "print(view_df['lda_score_0'].sum()/len(view_df))\n",
    "print(view_df['lda_score_1'].sum()/len(view_df))\n",
    "print(view_df['lda_score_2'].sum()/len(view_df))\n",
    "print(view_df['lda_score_3'].sum()/len(view_df))\n",
    "print(view_df['lda_score_4'].sum()/len(view_df))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 83,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score_0</th>\n",
       "      <th>lda_score_1</th>\n",
       "      <th>lda_score_2</th>\n",
       "      <th>lda_score_3</th>\n",
       "      <th>lda_score_4</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>33878</td>\n",
       "      <td>0</td>\n",
       "      <td>0.464875</td>\n",
       "      <td>0.418104</td>\n",
       "      <td>0.410717</td>\n",
       "      <td>0.582924</td>\n",
       "      <td>0.663965</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>76568</td>\n",
       "      <td>0</td>\n",
       "      <td>0.514842</td>\n",
       "      <td>0.392097</td>\n",
       "      <td>0.398716</td>\n",
       "      <td>0.586173</td>\n",
       "      <td>0.688356</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>95529</td>\n",
       "      <td>0</td>\n",
       "      <td>0.657040</td>\n",
       "      <td>0.685026</td>\n",
       "      <td>0.281238</td>\n",
       "      <td>0.607644</td>\n",
       "      <td>0.678328</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1046</td>\n",
       "      <td>0</td>\n",
       "      <td>0.851553</td>\n",
       "      <td>0.777606</td>\n",
       "      <td>0.065881</td>\n",
       "      <td>0.776598</td>\n",
       "      <td>0.896196</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>101732</td>\n",
       "      <td>0</td>\n",
       "      <td>0.618032</td>\n",
       "      <td>0.793915</td>\n",
       "      <td>0.086736</td>\n",
       "      <td>0.810066</td>\n",
       "      <td>0.909167</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>106560</td>\n",
       "      <td>0</td>\n",
       "      <td>0.744318</td>\n",
       "      <td>0.622385</td>\n",
       "      <td>0.192315</td>\n",
       "      <td>0.694039</td>\n",
       "      <td>0.826760</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>33469</td>\n",
       "      <td>0</td>\n",
       "      <td>0.460790</td>\n",
       "      <td>0.549610</td>\n",
       "      <td>0.378017</td>\n",
       "      <td>0.611169</td>\n",
       "      <td>0.613575</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>149358</td>\n",
       "      <td>0</td>\n",
       "      <td>0.653788</td>\n",
       "      <td>0.584912</td>\n",
       "      <td>0.265159</td>\n",
       "      <td>0.658602</td>\n",
       "      <td>0.742774</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25024</td>\n",
       "      <td>0</td>\n",
       "      <td>0.610724</td>\n",
       "      <td>0.636797</td>\n",
       "      <td>0.272931</td>\n",
       "      <td>0.638486</td>\n",
       "      <td>0.728207</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>76265</td>\n",
       "      <td>0</td>\n",
       "      <td>0.511089</td>\n",
       "      <td>0.548240</td>\n",
       "      <td>0.278886</td>\n",
       "      <td>0.701541</td>\n",
       "      <td>0.756394</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>691 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Sentiment  lda_score_0  lda_score_1  lda_score_2  lda_score_3  \\\n",
       "33878           0     0.464875     0.418104     0.410717     0.582924   \n",
       "76568           0     0.514842     0.392097     0.398716     0.586173   \n",
       "95529           0     0.657040     0.685026     0.281238     0.607644   \n",
       "1046            0     0.851553     0.777606     0.065881     0.776598   \n",
       "101732          0     0.618032     0.793915     0.086736     0.810066   \n",
       "...           ...          ...          ...          ...          ...   \n",
       "106560          0     0.744318     0.622385     0.192315     0.694039   \n",
       "33469           0     0.460790     0.549610     0.378017     0.611169   \n",
       "149358          0     0.653788     0.584912     0.265159     0.658602   \n",
       "25024           0     0.610724     0.636797     0.272931     0.638486   \n",
       "76265           0     0.511089     0.548240     0.278886     0.701541   \n",
       "\n",
       "        lda_score_4  \n",
       "33878      0.663965  \n",
       "76568      0.688356  \n",
       "95529      0.678328  \n",
       "1046       0.896196  \n",
       "101732     0.909167  \n",
       "...             ...  \n",
       "106560     0.826760  \n",
       "33469      0.613575  \n",
       "149358     0.742774  \n",
       "25024      0.728207  \n",
       "76265      0.756394  \n",
       "\n",
       "[691 rows x 6 columns]"
      ]
     },
     "execution_count": 83,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "view_df[view_df['Sentiment'] == 0 ]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [],
   "source": [
    "# def split_into_neg_pos(df):\n",
    "#     # USE CENTEROID "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 86,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lda(df):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df.PoN.astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 87,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "train=pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values\n",
    "train.to_csv('kaggle_csv.csv')\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['Phrase'] = train['Phrase']\n",
    "df['S0'] = train['Sentiment']\n",
    "\n",
    "df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]\n",
    "len(df[df['PoN'] == 1])\n",
    "lda_score, df, neg_score, n_cent, p_cent = get_lda(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 92,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_lda_submission(df, n_centroid, p_centroid):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    return df\n",
    "\n",
    "columns = ['Phrase']\n",
    "new_df =  pd.DataFrame(df, columns = columns)\n",
    "# new_df\n",
    "predicted_df = get_lda_submission(new_df, n_cent, p_cent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 97,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({1: 152309, 0: 3751})"
      ]
     },
     "execution_count": 97,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_df['actual'] = df['S0']\n",
    "Counter(predicted_df['lda_predict'] )"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Phrase         670\n",
       "lda_score      670\n",
       "lda_predict    670\n",
       "actual         670\n",
       "dtype: int64"
      ]
     },
     "execution_count": 101,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_df[(predicted_df.actual == 0) & (predicted_df.lda_predict == 0)].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Phrase         1218\n",
       "lda_score      1218\n",
       "lda_predict    1218\n",
       "actual         1218\n",
       "dtype: int64"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_df[(predicted_df.actual == 1) & (predicted_df.lda_predict == 0)].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Phrase         1863\n",
       "lda_score      1863\n",
       "lda_predict    1863\n",
       "actual         1863\n",
       "dtype: int64"
      ]
     },
     "execution_count": 103,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_df[(predicted_df.actual > 1) & (predicted_df.lda_predict == 0)].count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 108,
   "metadata": {},
   "outputs": [],
   "source": [
    "predicted_df['check'] = predicted_df.apply(lambda x: (x['lda_predict'] == x['bool']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 109,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Phrase</th>\n",
       "      <th>lda_score</th>\n",
       "      <th>lda_predict</th>\n",
       "      <th>actual</th>\n",
       "      <th>bool</th>\n",
       "      <th>check</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>0.531051</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>0.712447</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>A series</td>\n",
       "      <td>0.780069</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>A</td>\n",
       "      <td>0.577151</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>series</td>\n",
       "      <td>0.893099</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156055</td>\n",
       "      <td>Hearst 's</td>\n",
       "      <td>0.858866</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>0.891445</td>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>False</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>0.905823</td>\n",
       "      <td>1</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156058</td>\n",
       "      <td>avuncular</td>\n",
       "      <td>0.905623</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156059</td>\n",
       "      <td>chortles</td>\n",
       "      <td>0.905623</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>1</td>\n",
       "      <td>True</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>156060 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   Phrase  lda_score  \\\n",
       "0       A series of escapades demonstrating the adage ...   0.531051   \n",
       "1       A series of escapades demonstrating the adage ...   0.712447   \n",
       "2                                                A series   0.780069   \n",
       "3                                                       A   0.577151   \n",
       "4                                                  series   0.893099   \n",
       "...                                                   ...        ...   \n",
       "156055                                          Hearst 's   0.858866   \n",
       "156056                          forced avuncular chortles   0.891445   \n",
       "156057                                 avuncular chortles   0.905823   \n",
       "156058                                          avuncular   0.905623   \n",
       "156059                                           chortles   0.905623   \n",
       "\n",
       "        lda_predict  actual  bool  check  \n",
       "0                 1       1     0  False  \n",
       "1                 1       2     1   True  \n",
       "2                 1       2     1   True  \n",
       "3                 1       2     1   True  \n",
       "4                 1       2     1   True  \n",
       "...             ...     ...   ...    ...  \n",
       "156055            1       2     1   True  \n",
       "156056            1       1     0  False  \n",
       "156057            1       3     1   True  \n",
       "156058            1       2     1   True  \n",
       "156059            1       2     1   True  \n",
       "\n",
       "[156060 rows x 6 columns]"
      ]
     },
     "execution_count": 109,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "predicted_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 110,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({False: 34320, True: 121740})"
      ]
     },
     "execution_count": 110,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(predicted_df['check'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "156060"
      ]
     },
     "execution_count": 111,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.780084582852749"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "121740/len(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 114,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({False: 34320, True: 121740})"
      ]
     },
     "execution_count": 114,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "train=pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y=train['Sentiment'].values\n",
    "X=train['Phrase'].values\n",
    "train.to_csv('kaggle_csv.csv')\n",
    "\n",
    "df = pd.DataFrame()\n",
    "df['Phrase'] = train['Phrase']\n",
    "df['S0'] = train['Sentiment']\n",
    "\n",
    "df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]\n",
    "len(df[df['PoN'] == 1])\n",
    "lda_score, df, neg_score, n_cent, p_cent = get_lda(df)\n",
    "\n",
    "\n",
    "def get_lda(df):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df.PoN.astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    score = (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3) \n",
    "    return score, df, negativity_score, n_centroid, p_centroid\n",
    "\n",
    "def get_lda_submission(df, n_centroid, p_centroid):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    return df\n",
    "\n",
    "columns = ['Phrase']\n",
    "new_df =  pd.DataFrame(df, columns = columns)\n",
    "predicted_df = get_lda_submission(new_df, n_cent, p_cent)\n",
    "\n",
    "predicted_df['actual'] = df['S0']\n",
    "predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]\n",
    "predicted_df['check'] = predicted_df.apply(lambda x: (x['lda_predict'] == x['bool']), axis=1)\n",
    "\n",
    "Counter(predicted_df['check'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
