{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from nltk.tokenize.casual import casual_tokenize\n",
    "tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)\n",
    "from sklearn.preprocessing import MinMaxScaler"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>Actual</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>128037</td>\n",
       "      <td>128038</td>\n",
       "      <td>6887</td>\n",
       "      <td>as the main character suggests , ` what if</td>\n",
       "      <td>3</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>5192</td>\n",
       "      <td>5193</td>\n",
       "      <td>206</td>\n",
       "      <td>well-wrought story</td>\n",
       "      <td>4</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50057</td>\n",
       "      <td>50057</td>\n",
       "      <td>50058</td>\n",
       "      <td>2457</td>\n",
       "      <td>pack raw dough</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109259</td>\n",
       "      <td>109259</td>\n",
       "      <td>109260</td>\n",
       "      <td>5785</td>\n",
       "      <td>into the editing room</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73349</td>\n",
       "      <td>73349</td>\n",
       "      <td>73350</td>\n",
       "      <td>3748</td>\n",
       "      <td>concerned with morality</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25979</td>\n",
       "      <td>25979</td>\n",
       "      <td>25980</td>\n",
       "      <td>1189</td>\n",
       "      <td>Spy</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28724</td>\n",
       "      <td>28724</td>\n",
       "      <td>28725</td>\n",
       "      <td>1331</td>\n",
       "      <td>semi-autobiographical film</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>5064</td>\n",
       "      <td>5065</td>\n",
       "      <td>198</td>\n",
       "      <td>that writer and director Burr Steers knows the...</td>\n",
       "      <td>3</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85856</td>\n",
       "      <td>85856</td>\n",
       "      <td>85857</td>\n",
       "      <td>4443</td>\n",
       "      <td>associations you choose to make</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141693</td>\n",
       "      <td>141693</td>\n",
       "      <td>141694</td>\n",
       "      <td>7686</td>\n",
       "      <td>a human volcano or</td>\n",
       "      <td>2</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15606 rows × 6 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  PhraseId  SentenceId  \\\n",
       "128037      128037    128038        6887   \n",
       "5192          5192      5193         206   \n",
       "50057        50057     50058        2457   \n",
       "109259      109259    109260        5785   \n",
       "73349        73349     73350        3748   \n",
       "...            ...       ...         ...   \n",
       "25979        25979     25980        1189   \n",
       "28724        28724     28725        1331   \n",
       "5064          5064      5065         198   \n",
       "85856        85856     85857        4443   \n",
       "141693      141693    141694        7686   \n",
       "\n",
       "                                                   Phrase  Sentiment Actual  \n",
       "128037         as the main character suggests , ` what if          3    tbd  \n",
       "5192                                   well-wrought story          4    tbd  \n",
       "50057                                      pack raw dough          2    tbd  \n",
       "109259                              into the editing room          2    tbd  \n",
       "73349                             concerned with morality          2    tbd  \n",
       "...                                                   ...        ...    ...  \n",
       "25979                                                 Spy          2    tbd  \n",
       "28724                          semi-autobiographical film          2    tbd  \n",
       "5064    that writer and director Burr Steers knows the...          3    tbd  \n",
       "85856                     associations you choose to make          2    tbd  \n",
       "141693                                 a human volcano or          2    tbd  \n",
       "\n",
       "[15606 rows x 6 columns]"
      ]
     },
     "execution_count": 45,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import pandas as pd\n",
    "og_df = pd.read_csv('kaggle_csv.csv')\n",
    "df_sm = og_df.copy()\n",
    "df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)\n",
    "len(df_sm)\n",
    "# len(og_df)\n",
    "\n",
    "df = df_sm.copy()\n",
    "df['Actual'] = 'tbd'\n",
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 89,
   "metadata": {},
   "outputs": [],
   "source": [
    "# INPUT OG DF\n",
    "# OUTPUT 0,1 df and 3,4 df\n",
    "# def get_small_df():\n",
    "\n",
    "# STEP 1: GET CENTEROIDS (needed_vecs) FROM LABELED\n",
    "# STEP 2: APPLY CENTEROIDS (needed_vecs) TO UNLABELED\n",
    "\n",
    "def get_lda_submission(df, negativeness_score):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))\n",
    "    df['lda_predict'] = (df.lda_score > .5).astype(int)\n",
    "    return df\n",
    "\n",
    "needed_vecs = []\n",
    "def get_dividing_vec(df, PoN):\n",
    "    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n",
    "    mask = df['PoN'].astype(bool).values\n",
    "    n_centroid = tfidf_docs[mask].mean(axis=0)\n",
    "    p_centroid = tfidf_docs[~mask].mean(axis=0)\n",
    "    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n",
    "    return negativity_score\n",
    "\n",
    "def get_negatives(df):\n",
    "    df['PoN'] = [0 if x < 2 else 1 for x in df['Sentiment']]\n",
    "    # SPLIT ONE (into NEG)\n",
    "    dividing_vec_1 = get_dividing_vec(df, 'PoN')\n",
    "    needed_vecs.append(dividing_vec_1)\n",
    "    df_01 = get_lda_submission(df, dividing_vec_1)\n",
    "    sm_df = df_01[df_01['lda_predict'] == 0].copy()\n",
    "    \n",
    "    # SPLIT TWO (into 0 and 1)\n",
    "    sm_df['PoN2'] = [0 if x < 1 else 1 for x in sm_df['Sentiment']]\n",
    "    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')\n",
    "    needed_vecs.append(dividing_vec_2)\n",
    "    df_02 = get_lda_submission(sm_df, dividing_vec_2)\n",
    "    \n",
    "    \n",
    "#      = get_dividing_vec(df_01, 'PoN2')\n",
    "    \n",
    "#     print(needed_vecs)\n",
    "    # split big df \n",
    "    # take 0 \n",
    "    # split again\n",
    "    # PRINT!\n",
    "#     print(df)\n",
    "    return sm_df, needed_vecs\n",
    "\n",
    "def get_positives(df):\n",
    "    df['PoN'] = [0 if x < 3 else 1 for x in df['Sentiment']]\n",
    "    # SPLIT ONE (into NEG)\n",
    "    dividing_vec_1 = get_dividing_vec(df, 'PoN')\n",
    "    needed_vecs.append(dividing_vec_1)\n",
    "    df_01 = get_lda_submission(df, dividing_vec_1)\n",
    "    sm_df = df_01[df_01['lda_predict'] == 1].copy()\n",
    "    \n",
    "    # SPLIT TWO (into 0 and 1)\n",
    "    sm_df['PoN2'] = [1 if x == 4 else 0 for x in sm_df['Sentiment']]\n",
    "    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')\n",
    "    needed_vecs.append(dividing_vec_2)\n",
    "    df_02 = get_lda_submission(sm_df, dividing_vec_2)\n",
    "    return sm_df, needed_vecs\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 104,
   "metadata": {},
   "outputs": [],
   "source": [
    " \n",
    "\n",
    "def get_0_1(unlabeled_df, needed_vecs):\n",
    "    df_01 = get_lda_submission(unlabeled_df, needed_vecs[0])\n",
    "    sm_df = df_01[df_01['lda_predict'] == 0].copy()\n",
    "    df_01_b = get_lda_submission(sm_df, needed_vecs[1])\n",
    "    return df_01_b\n",
    "\n",
    "def get_3_4(unlabeled_df, needed_vecs):\n",
    "    df_01 = get_lda_submission(unlabeled_df, needed_vecs[2])\n",
    "    sm_df = df_01[df_01['lda_predict'] == 1].copy()\n",
    "    df_01_b = get_lda_submission(sm_df, needed_vecs[3])   \n",
    "    return df_01_b"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 91,
   "metadata": {},
   "outputs": [],
   "source": [
    "with_negs, needed_vecs = get_negatives(df)\n",
    "with_pos, needed_vecs = get_positives(df)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 100,
   "metadata": {},
   "outputs": [],
   "source": [
    "unlabeled_df = df_sm.copy()\n",
    "test = get_0_1(unlabeled_df, needed_vecs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1202"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 105,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score</th>\n",
       "      <th>lda_predict</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>127609</td>\n",
       "      <td>127609</td>\n",
       "      <td>127610</td>\n",
       "      <td>6864</td>\n",
       "      <td>one that is dark , disturbing , painful to wat...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.529864</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>112393</td>\n",
       "      <td>112393</td>\n",
       "      <td>112394</td>\n",
       "      <td>5969</td>\n",
       "      <td>a satisfying summer blockbuster and worth a look</td>\n",
       "      <td>3</td>\n",
       "      <td>0.741489</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>81788</td>\n",
       "      <td>81788</td>\n",
       "      <td>81789</td>\n",
       "      <td>4220</td>\n",
       "      <td>And how .</td>\n",
       "      <td>2</td>\n",
       "      <td>0.554698</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>38122</td>\n",
       "      <td>38122</td>\n",
       "      <td>38123</td>\n",
       "      <td>1813</td>\n",
       "      <td>reminds you of why animation is such a perfect...</td>\n",
       "      <td>4</td>\n",
       "      <td>0.586279</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>38553</td>\n",
       "      <td>38553</td>\n",
       "      <td>38554</td>\n",
       "      <td>1838</td>\n",
       "      <td>is to catch the pitch of his poetics , savor t...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.472868</td>\n",
       "      <td>0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>124761</td>\n",
       "      <td>124761</td>\n",
       "      <td>124762</td>\n",
       "      <td>6705</td>\n",
       "      <td>Both Garcia and Jagger turn in perfectly execu...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.743028</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>138913</td>\n",
       "      <td>138913</td>\n",
       "      <td>138914</td>\n",
       "      <td>7529</td>\n",
       "      <td>has all the enjoyable randomness of a very liv...</td>\n",
       "      <td>4</td>\n",
       "      <td>0.691226</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>146161</td>\n",
       "      <td>146161</td>\n",
       "      <td>146162</td>\n",
       "      <td>7948</td>\n",
       "      <td>great scares and a good surprise ending</td>\n",
       "      <td>3</td>\n",
       "      <td>0.795357</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>35603</td>\n",
       "      <td>35603</td>\n",
       "      <td>35604</td>\n",
       "      <td>1678</td>\n",
       "      <td>to the core of what it actually means to face ...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.579167</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>116843</td>\n",
       "      <td>116843</td>\n",
       "      <td>116844</td>\n",
       "      <td>6234</td>\n",
       "      <td>It is life affirming and heartbreaking , sweet...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.647515</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>450 rows × 7 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  PhraseId  SentenceId  \\\n",
       "127609      127609    127610        6864   \n",
       "112393      112393    112394        5969   \n",
       "81788        81788     81789        4220   \n",
       "38122        38122     38123        1813   \n",
       "38553        38553     38554        1838   \n",
       "...            ...       ...         ...   \n",
       "124761      124761    124762        6705   \n",
       "138913      138913    138914        7529   \n",
       "146161      146161    146162        7948   \n",
       "35603        35603     35604        1678   \n",
       "116843      116843    116844        6234   \n",
       "\n",
       "                                                   Phrase  Sentiment  \\\n",
       "127609  one that is dark , disturbing , painful to wat...          3   \n",
       "112393   a satisfying summer blockbuster and worth a look          3   \n",
       "81788                                           And how .          2   \n",
       "38122   reminds you of why animation is such a perfect...          4   \n",
       "38553   is to catch the pitch of his poetics , savor t...          3   \n",
       "...                                                   ...        ...   \n",
       "124761  Both Garcia and Jagger turn in perfectly execu...          3   \n",
       "138913  has all the enjoyable randomness of a very liv...          4   \n",
       "146161            great scares and a good surprise ending          3   \n",
       "35603   to the core of what it actually means to face ...          3   \n",
       "116843  It is life affirming and heartbreaking , sweet...          3   \n",
       "\n",
       "        lda_score  lda_predict  \n",
       "127609   0.529864            1  \n",
       "112393   0.741489            1  \n",
       "81788    0.554698            1  \n",
       "38122    0.586279            1  \n",
       "38553    0.472868            0  \n",
       "...           ...          ...  \n",
       "124761   0.743028            1  \n",
       "138913   0.691226            1  \n",
       "146161   0.795357            1  \n",
       "35603    0.579167            1  \n",
       "116843   0.647515            1  \n",
       "\n",
       "[450 rows x 7 columns]"
      ]
     },
     "execution_count": 105,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "unlabeled_df = df_sm.copy()\n",
    "test = get_3_4(unlabeled_df, needed_vecs)\n",
    "test"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 85,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({0: 1202})"
      ]
     },
     "execution_count": 85,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "from collections import Counter\n",
    "Counter(with_negs['lda_predict'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})"
      ]
     },
     "execution_count": 82,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "Counter(with_negs['Sentiment'].values)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 84,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1202"
      ]
     },
     "execution_count": 84,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(with_negs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 107,
   "metadata": {},
   "outputs": [],
   "source": [
    "test=pd.read_csv(\"../WK7/kaggle-sentiment/test.tsv\", delimiter='\\t')\n",
    "test.to_csv('kaggle_csv_test.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 113,
   "metadata": {},
   "outputs": [
    {
     "ename": "ValueError",
     "evalue": "Length of values does not match length of index",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mValueError\u001b[0m                                Traceback (most recent call last)",
      "\u001b[0;32m<ipython-input-113-bdea7e048706>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[0mdf0\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0munlabeled_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneeded_vecs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0munlabeled_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneeded_vecs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      3\u001b[0m \u001b[0mdf1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m<ipython-input-89-d4dc86660ab9>\u001b[0m in \u001b[0;36mget_lda_submission\u001b[0;34m(df, negativeness_score)\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnegativeness_score\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m     \u001b[0mtfidf_docs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfidf_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Phrase'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m     \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'lda_score'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMinMaxScaler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnegativeness_score\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m     11\u001b[0m     \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'lda_predict'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_score\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m.5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m     12\u001b[0m     \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   3470\u001b[0m         \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3471\u001b[0m             \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3472\u001b[0;31m             \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3474\u001b[0m     \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m   3547\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3548\u001b[0m         \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3549\u001b[0;31m         \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3550\u001b[0m         \u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m   3732\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3733\u001b[0m             \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3734\u001b[0;31m             \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m   3735\u001b[0m             \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m   3736\u001b[0m                 \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36msanitize_index\u001b[0;34m(data, index, copy)\u001b[0m\n\u001b[1;32m    610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    611\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 612\u001b[0;31m         \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Length of values does not match length of index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m    613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m    614\u001b[0m     \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mABCIndexClass\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
      "\u001b[0;31mValueError\u001b[0m: Length of values does not match length of index"
     ]
    }
   ],
   "source": [
    "df0 = get_lda_submission(unlabeled_df, needed_vecs[0])\n",
    "df1 = get_lda_submission(unlabeled_df, needed_vecs[1])\n",
    "df1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 112,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>PhraseId</th>\n",
       "      <th>SentenceId</th>\n",
       "      <th>Phrase</th>\n",
       "      <th>Sentiment</th>\n",
       "      <th>lda_score</th>\n",
       "      <th>lda_predict</th>\n",
       "      <th>actual</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>128037</td>\n",
       "      <td>128037</td>\n",
       "      <td>128038</td>\n",
       "      <td>6887</td>\n",
       "      <td>as the main character suggests , ` what if</td>\n",
       "      <td>3</td>\n",
       "      <td>0.625832</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5192</td>\n",
       "      <td>5192</td>\n",
       "      <td>5193</td>\n",
       "      <td>206</td>\n",
       "      <td>well-wrought story</td>\n",
       "      <td>4</td>\n",
       "      <td>0.870760</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>50057</td>\n",
       "      <td>50057</td>\n",
       "      <td>50058</td>\n",
       "      <td>2457</td>\n",
       "      <td>pack raw dough</td>\n",
       "      <td>2</td>\n",
       "      <td>0.858039</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>109259</td>\n",
       "      <td>109259</td>\n",
       "      <td>109260</td>\n",
       "      <td>5785</td>\n",
       "      <td>into the editing room</td>\n",
       "      <td>2</td>\n",
       "      <td>0.769221</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>73349</td>\n",
       "      <td>73349</td>\n",
       "      <td>73350</td>\n",
       "      <td>3748</td>\n",
       "      <td>concerned with morality</td>\n",
       "      <td>2</td>\n",
       "      <td>0.863717</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25979</td>\n",
       "      <td>25979</td>\n",
       "      <td>25980</td>\n",
       "      <td>1189</td>\n",
       "      <td>Spy</td>\n",
       "      <td>2</td>\n",
       "      <td>0.882761</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28724</td>\n",
       "      <td>28724</td>\n",
       "      <td>28725</td>\n",
       "      <td>1331</td>\n",
       "      <td>semi-autobiographical film</td>\n",
       "      <td>2</td>\n",
       "      <td>0.873131</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5064</td>\n",
       "      <td>5064</td>\n",
       "      <td>5065</td>\n",
       "      <td>198</td>\n",
       "      <td>that writer and director Burr Steers knows the...</td>\n",
       "      <td>3</td>\n",
       "      <td>0.803085</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>85856</td>\n",
       "      <td>85856</td>\n",
       "      <td>85857</td>\n",
       "      <td>4443</td>\n",
       "      <td>associations you choose to make</td>\n",
       "      <td>2</td>\n",
       "      <td>0.738841</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>141693</td>\n",
       "      <td>141693</td>\n",
       "      <td>141694</td>\n",
       "      <td>7686</td>\n",
       "      <td>a human volcano or</td>\n",
       "      <td>2</td>\n",
       "      <td>0.762257</td>\n",
       "      <td>1</td>\n",
       "      <td>tbd</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>15606 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Unnamed: 0  PhraseId  SentenceId  \\\n",
       "128037      128037    128038        6887   \n",
       "5192          5192      5193         206   \n",
       "50057        50057     50058        2457   \n",
       "109259      109259    109260        5785   \n",
       "73349        73349     73350        3748   \n",
       "...            ...       ...         ...   \n",
       "25979        25979     25980        1189   \n",
       "28724        28724     28725        1331   \n",
       "5064          5064      5065         198   \n",
       "85856        85856     85857        4443   \n",
       "141693      141693    141694        7686   \n",
       "\n",
       "                                                   Phrase  Sentiment  \\\n",
       "128037         as the main character suggests , ` what if          3   \n",
       "5192                                   well-wrought story          4   \n",
       "50057                                      pack raw dough          2   \n",
       "109259                              into the editing room          2   \n",
       "73349                             concerned with morality          2   \n",
       "...                                                   ...        ...   \n",
       "25979                                                 Spy          2   \n",
       "28724                          semi-autobiographical film          2   \n",
       "5064    that writer and director Burr Steers knows the...          3   \n",
       "85856                     associations you choose to make          2   \n",
       "141693                                 a human volcano or          2   \n",
       "\n",
       "        lda_score  lda_predict actual  \n",
       "128037   0.625832            1    tbd  \n",
       "5192     0.870760            1    tbd  \n",
       "50057    0.858039            1    tbd  \n",
       "109259   0.769221            1    tbd  \n",
       "73349    0.863717            1    tbd  \n",
       "...           ...          ...    ...  \n",
       "25979    0.882761            1    tbd  \n",
       "28724    0.873131            1    tbd  \n",
       "5064     0.803085            1    tbd  \n",
       "85856    0.738841            1    tbd  \n",
       "141693   0.762257            1    tbd  \n",
       "\n",
       "[15606 rows x 8 columns]"
      ]
     },
     "execution_count": 112,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "with_pred_label = unlabeled_df.copy()\n",
    "with_pred_label['actual'] = [0 if x == 0 else 'tbd' for x in df0['lda_predict']]\n",
    "with_pred_label"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 117,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "450"
      ]
     },
     "execution_count": 117,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(needed_vecs[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
