{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# HW8 methods on HW7 assignment"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## MOTIVATION\n",
    "\n",
    "In our endless quest for better accuracy in the Kaggle sentiment competition, we're going to try applying LDA to the dataset (just for funsies!)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## METHODS OVERVIEW\n",
    "Because LDA relies on boolean (two-class) labels, we first separate the data into binary splits: 0 vs. not-0, 1 vs. not-1, 2 vs. not-2, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [],
   "source": [
    "## =======================================================\n",
    "## IMPORTS\n",
    "## =======================================================\n",
    "import pandas as pd\n",
    "from nltk.tokenize.casual import casual_tokenize\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.preprocessing import MinMaxScaler\n",
    "\n",
    "## =======================================================\n",
    "## DATA\n",
    "## =======================================================\n",
    "# Kaggle sentiment training data, tab-separated.\n",
    "train = pd.read_csv(\"../WK7/kaggle-sentiment/train.tsv\", delimiter='\\t')\n",
    "y = train['Sentiment'].values\n",
    "X = train['Phrase'].values\n",
    "# Write a comma-separated copy into the working directory.\n",
    "train.to_csv('kaggle_csv.csv')\n",
    "\n",
    "## =======================================================\n",
    "## LDA\n",
    "## =======================================================\n",
    "# Module-level TF-IDF vectorizer shared by the helper functions in this cell.\n",
    "tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)\n",
    "\n",
    "def get_lda(df):\n",
    "    \"\"\"Score a two-class split with a centroid-difference ('LDA') classifier.\n",
    "\n",
    "    Adds two columns to ``df`` in place: ``lda_score`` (each phrase's projection onto\n",
    "    the centroid difference, min-max scaled to [0, 1]) and ``lda_predict`` (1 where\n",
    "    ``lda_score > .5``, else 0).\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with a text column ``Phrase`` and a label column ``PoN``.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    tuple of five items\n",
    "        ``(accuracy, df, negativity_score, n_centroid, p_centroid)`` where accuracy is\n",
    "        rounded to 3 places. Despite the names, ``n_centroid`` is the mean TF-IDF\n",
    "        vector of the rows whose ``PoN`` is truthy and ``p_centroid`` is the mean of\n",
    "        the remaining rows.\n",
    "    \"\"\"\n",
    "    # Delegate to get_centroids so the centroid/projection logic lives in one place.\n",
    "    n_centroid, p_centroid, negativity_score = get_centroids(df)\n",
    "    # MinMaxScaler needs a 2-D column; flatten the result back to 1-D before storing it.\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1, 1)).ravel()\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    # With 0/1 labels the mean absolute difference is the error rate.\n",
    "    accuracy = (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3)\n",
    "    return accuracy, df, negativity_score, n_centroid, p_centroid\n",
    "\n",
    "def get_centroids(df):\n",
    "    \"\"\"Fit the shared TF-IDF model on ``df['Phrase']`` and return the class centroids.\n",
    "\n",
    "    Returns ``(n_centroid, p_centroid, negativity_score)``: the mean TF-IDF vector of\n",
    "    the rows whose ``PoN`` is truthy, the mean of the remaining rows, and every row's\n",
    "    projection onto ``n_centroid - p_centroid``.\n",
    "    \"\"\"\n",
    "    doc_vectors = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()\n",
    "    is_flagged = df.PoN.astype(bool).values\n",
    "    n_centroid = doc_vectors[is_flagged].mean(axis=0)\n",
    "    p_centroid = doc_vectors[~is_flagged].mean(axis=0)\n",
    "    direction = n_centroid - p_centroid\n",
    "    return n_centroid, p_centroid, doc_vectors.dot(direction)\n",
    "    \n",
    "def get_lda_submission(df, n_centroid, p_centroid, refit=True):\n",
    "    \"\"\"Apply previously computed centroids to the phrases in ``df``.\n",
    "\n",
    "    Adds ``lda_score`` (projection onto ``n_centroid - p_centroid``, min-max scaled\n",
    "    over the rows of ``df``) and ``lda_predict`` (1 where ``lda_score > .5``) to\n",
    "    ``df`` in place and returns ``df``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    df : DataFrame with a text column ``Phrase``.\n",
    "    n_centroid, p_centroid : 1-D arrays as returned by ``get_centroids`` / ``get_lda``.\n",
    "    refit : bool, default True\n",
    "        True keeps the original behaviour and re-fits the shared ``tfidf_model`` on\n",
    "        ``df``. Pass False to reuse the vocabulary and IDF weights of the most recent\n",
    "        fit, which keeps the TF-IDF columns aligned with the centroids when ``df`` is\n",
    "        not the frame the centroids were computed from.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If the TF-IDF feature count does not match the centroid length.\n",
    "    \"\"\"\n",
    "    if refit:\n",
    "        tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()\n",
    "    else:\n",
    "        tfidf_docs = tfidf_model.transform(df['Phrase']).toarray()\n",
    "    direction = n_centroid - p_centroid\n",
    "    # A fit on different text yields a different vocabulary; report that clearly\n",
    "    # instead of surfacing a bare shape error from dot().\n",
    "    if tfidf_docs.shape[1] != len(direction):\n",
    "        raise ValueError(\n",
    "            'TF-IDF produced %d features but the centroids have %d; the vectorizer '\n",
    "            'vocabulary does not match the one used to compute the centroids'\n",
    "            % (tfidf_docs.shape[1], len(direction)))\n",
    "    negativity_score = tfidf_docs.dot(direction)\n",
    "    # MinMaxScaler needs a 2-D column; flatten the result back to 1-D before storing it.\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1, 1)).ravel()\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    return df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 1: Splitting the data into NEG (0,1) and POS (2,3,4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.78 [-7.65301422e-03 -3.94205354e-03 -2.55865579e-03 ...  1.39818841e-05\n",
      "  9.88668503e-06  9.88668503e-06]\n",
      "[7.10273499e-04 7.44736360e-05 1.72212536e-04 ... 4.64599879e-05\n",
      " 4.42011525e-05 0.00000000e+00]\n",
      "[4.17019741e-04 1.99202480e-05 3.04331371e-04 ... 0.00000000e+00\n",
      " 0.00000000e+00 1.11766109e-04]\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "16394"
      ]
     },
     "execution_count": 32,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Binary target: sentiment 0-1 -> 0 (NEG), sentiment 2-4 -> 1 (POS).\n",
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "df['PoN'] = (df['S0'] >= 2).astype(int)\n",
    "\n",
    "lda_score, df, neg_score, n_cent, p_cent = get_lda(df)\n",
    "print(lda_score, neg_score)\n",
    "print(n_cent)\n",
    "print(p_cent)\n",
    "len(n_cent)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "156060"
      ]
     },
     "execution_count": 35,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Only the last expression in a cell is displayed, so this cell shows len(df).\n",
    "len(n_cent)\n",
    "len(p_cent)\n",
    "len(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 2: Splitting the data into NEG (0,1,2) and POS (3,4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.736\n"
     ]
    }
   ],
   "source": [
    "# Binary target: sentiment 0-2 -> 0 (NEG), sentiment 3-4 -> 1 (POS).\n",
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "df['PoN'] = (df['S0'] >= 3).astype(int)\n",
    "\n",
    "# get_lda returns five values (accuracy, df, scores, n_centroid, p_centroid);\n",
    "# only the first two are needed here.\n",
    "lda_score, df, *_ = get_lda(df)\n",
    "print(lda_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 3: Splitting the data into NEG(0,1) and POS (3,4) (Removing Neutral)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.649\n"
     ]
    }
   ],
   "source": [
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "\n",
    "# Drop the neutral class (2) so only sentiments 0-1 (NEG) and 3-4 (POS) remain.\n",
    "df = df[df['S0'] != 2].copy()\n",
    "df['PoN'] = (df['S0'] >= 2).astype(int)\n",
    "# get_lda returns five values; only the accuracy and the scored frame are needed here.\n",
    "lda_score, df, *_ = get_lda(df)\n",
    "print(lda_score)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 4A: Taking the \"negatives\" from Test 3 and running LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Phrase</th>\n",
       "      <th>S0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>lda_score</th>\n",
       "      <th>lda_predict</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>94</td>\n",
       "      <td>, I suspect , would have a hard time sitting t...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.497936</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>100</td>\n",
       "      <td>would have a hard time sitting through this one .</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.447842</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>101</td>\n",
       "      <td>would have a hard time sitting through this one</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.460341</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>103</td>\n",
       "      <td>have a hard time sitting through this one</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0.488530</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>110</td>\n",
       "      <td>sitting through this one</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.495408</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156031</td>\n",
       "      <td>The movie 's downfall is to substitute plot fo...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.413219</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156032</td>\n",
       "      <td>The movie 's downfall</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.496717</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156033</td>\n",
       "      <td>is to substitute plot for personality .</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.424279</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156034</td>\n",
       "      <td>is to substitute plot for personality</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.437314</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156036</td>\n",
       "      <td>substitute plot for personality</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.496285</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>13679 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   Phrase  S0  PoN  lda_score  \\\n",
       "94      , I suspect , would have a hard time sitting t...   1    0   0.497936   \n",
       "100     would have a hard time sitting through this one .   1    0   0.447842   \n",
       "101       would have a hard time sitting through this one   0    0   0.460341   \n",
       "103             have a hard time sitting through this one   0    0   0.488530   \n",
       "110                              sitting through this one   1    0   0.495408   \n",
       "...                                                   ...  ..  ...        ...   \n",
       "156031  The movie 's downfall is to substitute plot fo...   1    0   0.413219   \n",
       "156032                              The movie 's downfall   1    0   0.496717   \n",
       "156033            is to substitute plot for personality .   1    0   0.424279   \n",
       "156034              is to substitute plot for personality   1    0   0.437314   \n",
       "156036                    substitute plot for personality   1    0   0.496285   \n",
       "\n",
       "        lda_predict  \n",
       "94                0  \n",
       "100               0  \n",
       "101               0  \n",
       "103               0  \n",
       "110               0  \n",
       "...             ...  \n",
       "156031            0  \n",
       "156032            0  \n",
       "156033            0  \n",
       "156034            0  \n",
       "156036            0  \n",
       "\n",
       "[13679 rows x 5 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Rows the previous classifier labelled 0, copied so later edits leave df untouched.\n",
    "df_neg = df.loc[df['lda_predict'] == 0].copy()\n",
    "df_neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.807\n"
     ]
    }
   ],
   "source": [
    "# Second-level split inside the predicted negatives: sentiment 0 -> 0, anything else -> 1.\n",
    "df_neg['PoN'] = (df_neg['S0'] != 0).astype(int)\n",
    "# get_lda returns five values; only the accuracy and the scored frame are needed here.\n",
    "lda_score, n_df, *_ = get_lda(df_neg)\n",
    "print(lda_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Phrase</th>\n",
       "      <th>S0</th>\n",
       "      <th>PoN</th>\n",
       "      <th>lda_score</th>\n",
       "      <th>lda_predict</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>A series of escapades demonstrating the adage ...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.628765</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>good for the goose</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.668112</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>22</td>\n",
       "      <td>good</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.837784</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>33</td>\n",
       "      <td>the gander , some of which occasionally amuses...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.572662</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>46</td>\n",
       "      <td>amuses</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.562098</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156047</td>\n",
       "      <td>quietly suggesting the sadness and obsession b...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.586494</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156051</td>\n",
       "      <td>sadness and obsession</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.639210</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156052</td>\n",
       "      <td>sadness and</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.677912</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156056</td>\n",
       "      <td>forced avuncular chortles</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0.547883</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>156057</td>\n",
       "      <td>avuncular chortles</td>\n",
       "      <td>3</td>\n",
       "      <td>1</td>\n",
       "      <td>0.561126</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>76478 rows × 5 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                   Phrase  S0  PoN  lda_score  \\\n",
       "0       A series of escapades demonstrating the adage ...   1    0   0.628765   \n",
       "21                                     good for the goose   3    1   0.668112   \n",
       "22                                                   good   3    1   0.837784   \n",
       "33      the gander , some of which occasionally amuses...   1    0   0.572662   \n",
       "46                                                 amuses   3    1   0.562098   \n",
       "...                                                   ...  ..  ...        ...   \n",
       "156047  quietly suggesting the sadness and obsession b...   1    0   0.586494   \n",
       "156051                              sadness and obsession   1    0   0.639210   \n",
       "156052                                        sadness and   1    0   0.677912   \n",
       "156056                          forced avuncular chortles   1    0   0.547883   \n",
       "156057                                 avuncular chortles   3    1   0.561126   \n",
       "\n",
       "        lda_predict  \n",
       "0                 1  \n",
       "21                1  \n",
       "22                1  \n",
       "33                1  \n",
       "46                1  \n",
       "...             ...  \n",
       "156047            1  \n",
       "156051            1  \n",
       "156052            1  \n",
       "156056            1  \n",
       "156057            1  \n",
       "\n",
       "[76478 rows x 5 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display df; the Test 4A cells above only worked on the df_neg copy.\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 4B: Taking the \"positives\" from Test 3 and running LDA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.618\n"
     ]
    }
   ],
   "source": [
    "df_pos = df[df['lda_predict'] == 1].copy()\n",
    "# Second-level split inside the predicted positives: sentiment 3 -> 0, anything else -> 1.\n",
    "df_pos['PoN'] = (df_pos['S0'] != 3).astype(int)\n",
    "# get_lda returns five values; only the accuracy and the scored frame are needed here.\n",
    "lda_score, p_df, *_ = get_lda(df_pos)\n",
    "print(lda_score)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## TEST 5: Checking whether I need labels, because I'm second-guessing myself here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rebuild the working frame from train; binary target: sentiment 0-1 -> 0, 2-4 -> 1.\n",
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "df['PoN'] = (df['S0'] >= 2).astype(int)\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### FIRST SPLIT -- trying to get the negatives"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "too many values to unpack (expected 2)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-24-7702aba4e2aa>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mneg_n_centroid\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneg_p_centroid\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_centroids\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0mscore\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_lda\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mscore\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: too many values to unpack (expected 2)"
     ]
    }
   ],
   "source": [
    "# get_lda already returns the centroids (accuracy, df, scores, n_centroid, p_centroid),\n",
    "# so one call gives both the score and the first-split centroids without fitting twice.\n",
    "score, df, _, neg_n_centroid, neg_p_centroid = get_lda(df)\n",
    "print(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# n_centroid / p_centroid are never defined at notebook level; use the first-split\n",
    "# centroids (neg_n_centroid / neg_p_centroid) computed above instead.\n",
    "submission_df = get_lda_submission(df, neg_n_centroid, neg_p_centroid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lda_predict == 0 is the NEG side of the first split (PoN is 0 for sentiment < 2),\n",
    "# matching the df_neg selection on the next line.\n",
    "submission_df['round_1'] = ['neg' if x < 1 else 'pos' for x in submission_df['lda_predict']]\n",
    "df_neg = submission_df[submission_df['lda_predict'] == 0].copy()\n",
    "df_neg['PoN'] = (df_neg['S0'] != 0).astype(int)\n",
    "# get_lda returns five values; only the accuracy and the scored frame are needed here.\n",
    "lda_score, n_df, *_ = get_lda(df_neg)\n",
    "print(lda_score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary target: sentiment 0-2 -> 0, sentiment 3-4 -> 1.\n",
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "df['PoN'] = (df['S0'] >= 3).astype(int)\n",
    "\n",
    "# get_lda returns (accuracy, df, scores, n_centroid, p_centroid); one call yields the\n",
    "# score and the centroids, so a separate get_centroids fit is not needed.\n",
    "score, df, _, pos_n_centroid, pos_p_centroid = get_lda(df)\n",
    "print(score)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rows predicted 1, then the subset of those whose prediction agrees with PoN.\n",
    "df_pos = df.loc[df['lda_predict'] == 1].copy()\n",
    "df_pos_true = df_pos.loc[df_pos['PoN'] == df_pos['lda_predict']]\n",
    "df_pos_true"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows in df_pos_true with sentiment 3.\n",
    "int((df_pos_true['S0'] == 3).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Number of rows in df_pos_true with sentiment 4.\n",
    "int((df_pos_true['S0'] == 4).sum())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Total number of rows in df_pos_true.\n",
    "df_pos_true.shape[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second-level split on the correctly predicted positives: sentiment 3 -> 0, else -> 1.\n",
    "df = pd.DataFrame({'Phrase': df_pos_true['Phrase'], 'S0': df_pos_true['S0']})\n",
    "df['PoN'] = (df['S0'] != 3).astype(int)\n",
    "\n",
    "# get_lda returns (accuracy, df, scores, n_centroid, p_centroid); one call yields the\n",
    "# score and the centroids, so a separate get_centroids fit is not needed.\n",
    "score, df, _, pos2_n_centroid, pos2_p_centroid = get_lda(df)\n",
    "print(score)\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Re-score df with the pos2 centroids; get_lda_submission writes lda_score and\n",
    "# lda_predict onto the frame it is given and returns that same frame.\n",
    "df = get_lda_submission(df, pos2_n_centroid, pos2_p_centroid)\n",
    "print(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Map the binary prediction back to a sentiment label: 0 -> 3, anything else -> 4.\n",
    "df['actual_score'] = df['lda_predict'].map(lambda pred: 3 if pred == 0 else 4)\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "# Count how many training rows carry each Sentiment value.\n",
    "Counter(train['Sentiment'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Copy of train without the Sentiment column.\n",
    "train_test = train.drop(columns=['Sentiment'])\n",
    "train_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display train; the drop() above returned a new frame, so Sentiment is still here.\n",
    "train"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# n_centroid / p_centroid are never defined at notebook level; use the first-split\n",
    "# centroids (neg_n_centroid / neg_p_centroid) computed earlier instead.\n",
    "submission_df = get_lda_submission(train_test, neg_n_centroid, neg_p_centroid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Put the Sentiment column from train next to the first-pass predictions.\n",
    "submission_df['COMPARE'] = train['Sentiment']\n",
    "submission_df\n",
    "df_neg = submission_df[submission_df['lda_predict'] == 0].copy()\n",
    "# NOTE(review): get_lda_submission calls fit_transform on df_neg, so its TF-IDF columns\n",
    "# may not line up with neg_n_centroid / neg_p_centroid -- confirm before relying on this.\n",
    "submission_df_2 = get_lda_submission(df_neg, neg_n_centroid, neg_p_centroid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display the predicted-negative subset (showing submission_df_2 is left disabled).\n",
    "df_neg"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Binary target: sentiment 0-1 -> 0, sentiment 2-4 -> 1.\n",
    "df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})\n",
    "df['PoN'] = (df['S0'] >= 2).astype(int)\n",
    "\n",
    "n_cent_1, p_cent_1, neg_score = get_centroids(df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Per-phrase projection scores returned by get_centroids (not min-max scaled).\n",
    "neg_score"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Score the unlabeled copy of train with the centroids from the x < 2 split.\n",
    "train_test = train.drop(columns=['Sentiment'])\n",
    "submission_df = get_lda_submission(train_test, n_cent_1, p_cent_1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Smallest predicted label in submission_df.\n",
    "submission_df['lda_predict'].min()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_test has lda_predict because get_lda_submission adds the column to the frame it is given.\n",
    "train_test['lda_predict'].values.max()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# train_test_neg = train_test[train_test['lda_predict'] == 0].copy()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display train_test, including the lda_score / lda_predict columns added by get_lda_submission.\n",
    "train_test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
