{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [], "source": [ "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from nltk.tokenize.casual import casual_tokenize\n", "tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)\n", "from sklearn.preprocessing import MinMaxScaler" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PhraseIdSentenceIdPhraseSentimentActual
1280371280371280386887as the main character suggests , ` what if3tbd
519251925193206well-wrought story4tbd
5005750057500582457pack raw dough2tbd
1092591092591092605785into the editing room2tbd
7334973349733503748concerned with morality2tbd
.....................
2597925979259801189Spy2tbd
2872428724287251331semi-autobiographical film2tbd
506450645065198that writer and director Burr Steers knows the...3tbd
8585685856858574443associations you choose to make2tbd
1416931416931416947686a human volcano or2tbd
\n", "

15606 rows × 6 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PhraseId SentenceId \\\n", "128037 128037 128038 6887 \n", "5192 5192 5193 206 \n", "50057 50057 50058 2457 \n", "109259 109259 109260 5785 \n", "73349 73349 73350 3748 \n", "... ... ... ... \n", "25979 25979 25980 1189 \n", "28724 28724 28725 1331 \n", "5064 5064 5065 198 \n", "85856 85856 85857 4443 \n", "141693 141693 141694 7686 \n", "\n", " Phrase Sentiment Actual \n", "128037 as the main character suggests , ` what if 3 tbd \n", "5192 well-wrought story 4 tbd \n", "50057 pack raw dough 2 tbd \n", "109259 into the editing room 2 tbd \n", "73349 concerned with morality 2 tbd \n", "... ... ... ... \n", "25979 Spy 2 tbd \n", "28724 semi-autobiographical film 2 tbd \n", "5064 that writer and director Burr Steers knows the... 3 tbd \n", "85856 associations you choose to make 2 tbd \n", "141693 a human volcano or 2 tbd \n", "\n", "[15606 rows x 6 columns]" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import pandas as pd\n", "og_df = pd.read_csv('kaggle_csv.csv')\n", "df_sm = og_df.copy()\n", "df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)\n", "len(df_sm)\n", "# len(og_df)\n", "\n", "df = df_sm.copy()\n", "df['Actual'] = 'tbd'\n", "df" ] }, { "cell_type": "code", "execution_count": 89, "metadata": {}, "outputs": [], "source": [ "# INPUT OG DF\n", "# OUTPUT 0,1 df and 3,4 df\n", "# def get_small_df():\n", "\n", "# STEP 1: GET CENTEROIDS (needed_vecs) FROM LABELED\n", "# STEP 2: APPLY CENTEROIDS (needed_vecs) TO UNLABELED\n", "\n", "def get_lda_submission(df, negativeness_score):\n", " tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n", " df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))\n", " df['lda_predict'] = (df.lda_score > .5).astype(int)\n", " return df\n", "\n", "needed_vecs = []\n", "def get_dividing_vec(df, PoN):\n", " tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()\n", " mask = df['PoN'].astype(bool).values\n", " n_centroid = tfidf_docs[mask].mean(axis=0)\n", " p_centroid = tfidf_docs[~mask].mean(axis=0)\n", " negativity_score = tfidf_docs.dot(n_centroid - p_centroid)\n", " return negativity_score\n", "\n", "def get_negatives(df):\n", " df['PoN'] = [0 if x < 2 else 1 for x in df['Sentiment']]\n", " # SPLIT ONE (into NEG)\n", " dividing_vec_1 = get_dividing_vec(df, 'PoN')\n", " needed_vecs.append(dividing_vec_1)\n", " df_01 = get_lda_submission(df, dividing_vec_1)\n", " sm_df = df_01[df_01['lda_predict'] == 0].copy()\n", " \n", " # SPLIT TWO (into 0 and 1)\n", " sm_df['PoN2'] = [0 if x < 1 else 1 for x in sm_df['Sentiment']]\n", " dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')\n", " needed_vecs.append(dividing_vec_2)\n", " df_02 = get_lda_submission(sm_df, dividing_vec_2)\n", " \n", " \n", "# = get_dividing_vec(df_01, 'PoN2')\n", " \n", "# print(needed_vecs)\n", " # split big df \n", " # take 0 \n", " # split again\n", " # PRINT!\n", "# print(df)\n", " return sm_df, needed_vecs\n", "\n", "def get_positives(df):\n", " df['PoN'] = [0 if x < 3 else 1 for x in df['Sentiment']]\n", " # SPLIT ONE (into NEG)\n", " dividing_vec_1 = get_dividing_vec(df, 'PoN')\n", " needed_vecs.append(dividing_vec_1)\n", " df_01 = get_lda_submission(df, dividing_vec_1)\n", " sm_df = df_01[df_01['lda_predict'] == 1].copy()\n", " \n", " # SPLIT TWO (into 0 and 1)\n", " sm_df['PoN2'] = [1 if x == 4 else 0 for x in sm_df['Sentiment']]\n", " dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')\n", " 
{ "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [], "source": [ "# Apply the stored score vectors to an unlabeled frame.\n", "# NOTE: needed_vecs holds per-phrase scores, so these helpers only line up when\n", "# unlabeled_df has the same rows, in the same order, as the frame the scores were\n", "# computed from (see the ValueError further down).\n", "\n", "def get_0_1(unlabeled_df, needed_vecs):\n", "    df_01 = get_lda_submission(unlabeled_df, needed_vecs[0])\n", "    sm_df = df_01[df_01['lda_predict'] == 0].copy()\n", "    df_01_b = get_lda_submission(sm_df, needed_vecs[1])\n", "    return df_01_b\n", "\n", "def get_3_4(unlabeled_df, needed_vecs):\n", "    df_01 = get_lda_submission(unlabeled_df, needed_vecs[2])\n", "    sm_df = df_01[df_01['lda_predict'] == 1].copy()\n", "    df_01_b = get_lda_submission(sm_df, needed_vecs[3])\n", "    return df_01_b" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [], "source": [ "with_negs, needed_vecs = get_negatives(df)\n", "with_pos, needed_vecs = get_positives(df)\n" ] }, { "cell_type": "code", "execution_count": 100, "metadata": {}, "outputs": [], "source": [ "unlabeled_df = df_sm.copy()\n", "test = get_0_1(unlabeled_df, needed_vecs)" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1202" ] }, "execution_count": 102, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(test)" ] },
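{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Sanity check (added): the 0/1 bucket returned by get_0_1 still carries the\n", "# original Sentiment labels, so a quick count shows how much 2-4 material\n", "# leaked into it.\n", "from collections import Counter\n", "Counter(test['Sentiment'].values)" ] },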
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PhraseIdSentenceIdPhraseSentimentlda_scorelda_predict
1276091276091276106864one that is dark , disturbing , painful to wat...30.5298641
1123931123931123945969a satisfying summer blockbuster and worth a look30.7414891
8178881788817894220And how .20.5546981
3812238122381231813reminds you of why animation is such a perfect...40.5862791
3855338553385541838is to catch the pitch of his poetics , savor t...30.4728680
........................
1247611247611247626705Both Garcia and Jagger turn in perfectly execu...30.7430281
1389131389131389147529has all the enjoyable randomness of a very liv...40.6912261
1461611461611461627948great scares and a good surprise ending30.7953571
3560335603356041678to the core of what it actually means to face ...30.5791671
1168431168431168446234It is life affirming and heartbreaking , sweet...30.6475151
\n", "

450 rows × 7 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PhraseId SentenceId \\\n", "127609 127609 127610 6864 \n", "112393 112393 112394 5969 \n", "81788 81788 81789 4220 \n", "38122 38122 38123 1813 \n", "38553 38553 38554 1838 \n", "... ... ... ... \n", "124761 124761 124762 6705 \n", "138913 138913 138914 7529 \n", "146161 146161 146162 7948 \n", "35603 35603 35604 1678 \n", "116843 116843 116844 6234 \n", "\n", " Phrase Sentiment \\\n", "127609 one that is dark , disturbing , painful to wat... 3 \n", "112393 a satisfying summer blockbuster and worth a look 3 \n", "81788 And how . 2 \n", "38122 reminds you of why animation is such a perfect... 4 \n", "38553 is to catch the pitch of his poetics , savor t... 3 \n", "... ... ... \n", "124761 Both Garcia and Jagger turn in perfectly execu... 3 \n", "138913 has all the enjoyable randomness of a very liv... 4 \n", "146161 great scares and a good surprise ending 3 \n", "35603 to the core of what it actually means to face ... 3 \n", "116843 It is life affirming and heartbreaking , sweet... 3 \n", "\n", " lda_score lda_predict \n", "127609 0.529864 1 \n", "112393 0.741489 1 \n", "81788 0.554698 1 \n", "38122 0.586279 1 \n", "38553 0.472868 0 \n", "... ... ... \n", "124761 0.743028 1 \n", "138913 0.691226 1 \n", "146161 0.795357 1 \n", "35603 0.579167 1 \n", "116843 0.647515 1 \n", "\n", "[450 rows x 7 columns]" ] }, "execution_count": 105, "metadata": {}, "output_type": "execute_result" } ], "source": [ "unlabeled_df = df_sm.copy()\n", "test = get_3_4(unlabeled_df, needed_vecs)\n", "test" ] }, { "cell_type": "code", "execution_count": 85, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Counter({0: 1202})" ] }, "execution_count": 85, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from collections import Counter\n", "Counter(with_negs['lda_predict'].values)" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "Counter(with_negs['Sentiment'].values)" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "1202" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(with_negs)" ] }, { "cell_type": "code", "execution_count": 107, "metadata": {}, "outputs": [], "source": [ "test=pd.read_csv(\"../WK7/kaggle-sentiment/test.tsv\", delimiter='\\t')\n", "test.to_csv('kaggle_csv_test.csv')" ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [ { "ename": "ValueError", "evalue": "Length of values does not match length of index", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mdf0\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0munlabeled_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mneeded_vecs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdf1\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0munlabeled_df\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mneeded_vecs\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3\u001b[0m \u001b[0mdf1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m\u001b[0m in \u001b[0;36mget_lda_submission\u001b[0;34m(df, negativeness_score)\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_lda_submission\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnegativeness_score\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0mtfidf_docs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtfidf_model\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mraw_documents\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Phrase'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtoarray\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'lda_score'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mMinMaxScaler\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit_transform\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnegativeness_score\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreshape\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 11\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'lda_predict'\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mlda_score\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m.5\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mint\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 12\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m__setitem__\u001b[0;34m(self, key, value)\u001b[0m\n\u001b[1;32m 3470\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3471\u001b[0m \u001b[0;31m# set column\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3472\u001b[0;31m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3473\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3474\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m_setitem_slice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_set_item\u001b[0;34m(self, key, 
value)\u001b[0m\n\u001b[1;32m 3547\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3548\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_ensure_valid_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3549\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_sanitize_column\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3550\u001b[0m \u001b[0mNDFrame\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_set_item\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkey\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3551\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py\u001b[0m in \u001b[0;36m_sanitize_column\u001b[0;34m(self, key, value, broadcast)\u001b[0m\n\u001b[1;32m 3732\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3733\u001b[0m \u001b[0;31m# turn me into an ndarray\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 3734\u001b[0;31m \u001b[0mvalue\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msanitize_index\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 3735\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mndarray\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mIndex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3736\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mvalue\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m>\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py\u001b[0m in \u001b[0;36msanitize_index\u001b[0;34m(data, index, copy)\u001b[0m\n\u001b[1;32m 610\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 611\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 612\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mValueError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Length of values does not match length of index\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 613\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 
614\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdata\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mABCIndexClass\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mcopy\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mValueError\u001b[0m: Length of values does not match length of index" ] } ], "source": [ "df0 = get_lda_submission(unlabeled_df, needed_vecs[0])\n", "df1 = get_lda_submission(unlabeled_df, needed_vecs[1])\n", "df1" ] }, { "cell_type": "code", "execution_count": 112, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0PhraseIdSentenceIdPhraseSentimentlda_scorelda_predictactual
1280371280371280386887as the main character suggests , ` what if30.6258321tbd
519251925193206well-wrought story40.8707601tbd
5005750057500582457pack raw dough20.8580391tbd
1092591092591092605785into the editing room20.7692211tbd
7334973349733503748concerned with morality20.8637171tbd
...........................
2597925979259801189Spy20.8827611tbd
2872428724287251331semi-autobiographical film20.8731311tbd
506450645065198that writer and director Burr Steers knows the...30.8030851tbd
8585685856858574443associations you choose to make20.7388411tbd
1416931416931416947686a human volcano or20.7622571tbd
\n", "

15606 rows × 8 columns

\n", "
" ], "text/plain": [ " Unnamed: 0 PhraseId SentenceId \\\n", "128037 128037 128038 6887 \n", "5192 5192 5193 206 \n", "50057 50057 50058 2457 \n", "109259 109259 109260 5785 \n", "73349 73349 73350 3748 \n", "... ... ... ... \n", "25979 25979 25980 1189 \n", "28724 28724 28725 1331 \n", "5064 5064 5065 198 \n", "85856 85856 85857 4443 \n", "141693 141693 141694 7686 \n", "\n", " Phrase Sentiment \\\n", "128037 as the main character suggests , ` what if 3 \n", "5192 well-wrought story 4 \n", "50057 pack raw dough 2 \n", "109259 into the editing room 2 \n", "73349 concerned with morality 2 \n", "... ... ... \n", "25979 Spy 2 \n", "28724 semi-autobiographical film 2 \n", "5064 that writer and director Burr Steers knows the... 3 \n", "85856 associations you choose to make 2 \n", "141693 a human volcano or 2 \n", "\n", " lda_score lda_predict actual \n", "128037 0.625832 1 tbd \n", "5192 0.870760 1 tbd \n", "50057 0.858039 1 tbd \n", "109259 0.769221 1 tbd \n", "73349 0.863717 1 tbd \n", "... ... ... ... \n", "25979 0.882761 1 tbd \n", "28724 0.873131 1 tbd \n", "5064 0.803085 1 tbd \n", "85856 0.738841 1 tbd \n", "141693 0.762257 1 tbd \n", "\n", "[15606 rows x 8 columns]" ] }, "execution_count": 112, "metadata": {}, "output_type": "execute_result" } ], "source": [ "with_pred_label = unlabeled_df.copy()\n", "with_pred_label['actual'] = [0 if x == 0 else 'tbd' for x in df0['lda_predict']]\n", "with_pred_label" ] }, { "cell_type": "code", "execution_count": 117, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "450" ] }, "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ "len(needed_vecs[3])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }