In [ ]:
 
In [44]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler
In [45]:
import pandas as pd
og_df = pd.read_csv('kaggle_csv.csv')
df_sm = og_df.copy()
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)
len(df_sm)
# len(og_df)

df = df_sm.copy()
df['Actual'] = 'tbd'
df
Out[45]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment Actual
128037 128037 128038 6887 as the main character suggests , ` what if 3 tbd
5192 5192 5193 206 well-wrought story 4 tbd
50057 50057 50058 2457 pack raw dough 2 tbd
109259 109259 109260 5785 into the editing room 2 tbd
73349 73349 73350 3748 concerned with morality 2 tbd
... ... ... ... ... ... ...
25979 25979 25980 1189 Spy 2 tbd
28724 28724 28725 1331 semi-autobiographical film 2 tbd
5064 5064 5065 198 that writer and director Burr Steers knows the... 3 tbd
85856 85856 85857 4443 associations you choose to make 2 tbd
141693 141693 141694 7686 a human volcano or 2 tbd

15606 rows × 6 columns

In [89]:
# INPUT OG DF
# OUTPUT 0,1 df and 3,4 df
# def get_small_df():

# STEP 1: GET CENTEROIDS (needed_vecs) FROM LABELED
# STEP 2: APPLY CENTEROIDS (needed_vecs) TO UNLABELED

def get_lda_submission(df, negativeness_score):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df

needed_vecs = []
def get_dividing_vec(df, PoN):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score

def get_negatives(df):
    df['PoN'] = [0 if x < 2 else 1 for x in df['Sentiment']]
    # SPLIT ONE (into NEG)
    dividing_vec_1 = get_dividing_vec(df, 'PoN')
    needed_vecs.append(dividing_vec_1)
    df_01 = get_lda_submission(df, dividing_vec_1)
    sm_df = df_01[df_01['lda_predict'] == 0].copy()
    
    # SPLIT TWO (into 0 and 1)
    sm_df['PoN2'] = [0 if x < 1 else 1 for x in sm_df['Sentiment']]
    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')
    needed_vecs.append(dividing_vec_2)
    df_02 = get_lda_submission(sm_df, dividing_vec_2)
    
    
#      = get_dividing_vec(df_01, 'PoN2')
    
#     print(needed_vecs)
    # split big df 
    # take 0 
    # split again
    # PRINT!
#     print(df)
    return sm_df, needed_vecs

def get_positives(df):
    df['PoN'] = [0 if x < 3 else 1 for x in df['Sentiment']]
    # SPLIT ONE (into NEG)
    dividing_vec_1 = get_dividing_vec(df, 'PoN')
    needed_vecs.append(dividing_vec_1)
    df_01 = get_lda_submission(df, dividing_vec_1)
    sm_df = df_01[df_01['lda_predict'] == 1].copy()
    
    # SPLIT TWO (into 0 and 1)
    sm_df['PoN2'] = [1 if x == 4 else 0 for x in sm_df['Sentiment']]
    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')
    needed_vecs.append(dividing_vec_2)
    df_02 = get_lda_submission(sm_df, dividing_vec_2)
    return sm_df, needed_vecs
In [104]:
 

def get_0_1(unlabeled_df, needed_vecs):
    df_01 = get_lda_submission(unlabeled_df, needed_vecs[0])
    sm_df = df_01[df_01['lda_predict'] == 0].copy()
    df_01_b = get_lda_submission(sm_df, needed_vecs[1])
    return df_01_b

def get_3_4(unlabeled_df, needed_vecs):
    df_01 = get_lda_submission(unlabeled_df, needed_vecs[2])
    sm_df = df_01[df_01['lda_predict'] == 1].copy()
    df_01_b = get_lda_submission(sm_df, needed_vecs[3])   
    return df_01_b
In [91]:
with_negs, needed_vecs = get_negatives(df)
with_pos, needed_vecs = get_positives(df)
In [100]:
unlabeled_df = df_sm.copy()
test = get_0_1(unlabeled_df, needed_vecs)
In [102]:
len(test)
Out[102]:
1202
In [105]:
unlabeled_df = df_sm.copy()
test = get_3_4(unlabeled_df, needed_vecs)
test
Out[105]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment lda_score lda_predict
127609 127609 127610 6864 one that is dark , disturbing , painful to wat... 3 0.529864 1
112393 112393 112394 5969 a satisfying summer blockbuster and worth a look 3 0.741489 1
81788 81788 81789 4220 And how . 2 0.554698 1
38122 38122 38123 1813 reminds you of why animation is such a perfect... 4 0.586279 1
38553 38553 38554 1838 is to catch the pitch of his poetics , savor t... 3 0.472868 0
... ... ... ... ... ... ... ...
124761 124761 124762 6705 Both Garcia and Jagger turn in perfectly execu... 3 0.743028 1
138913 138913 138914 7529 has all the enjoyable randomness of a very liv... 4 0.691226 1
146161 146161 146162 7948 great scares and a good surprise ending 3 0.795357 1
35603 35603 35604 1678 to the core of what it actually means to face ... 3 0.579167 1
116843 116843 116844 6234 It is life affirming and heartbreaking , sweet... 3 0.647515 1

450 rows × 7 columns

In [85]:
from collections import Counter
Counter(with_negs['lda_predict'].values)
Out[85]:
Counter({0: 1202})
In [82]:
Counter(with_negs['Sentiment'].values)
Out[82]:
Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})
In [84]:
len(with_negs)
Out[84]:
1202
In [107]:
test=pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')
test.to_csv('kaggle_csv_test.csv')
In [113]:
df0 = get_lda_submission(unlabeled_df, needed_vecs[0])
df1 = get_lda_submission(unlabeled_df, needed_vecs[1])
df1
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-113-bdea7e048706> in <module>
      1 df0 = get_lda_submission(unlabeled_df, needed_vecs[0])
----> 2 df1 = get_lda_submission(unlabeled_df, needed_vecs[1])
      3 df1

<ipython-input-89-d4dc86660ab9> in get_lda_submission(df, negativeness_score)
      8 def get_lda_submission(df, negativeness_score):
      9     tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
---> 10     df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
     11     df['lda_predict'] = (df.lda_score > .5).astype(int)
     12     return df

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3470         else:
   3471             # set column
-> 3472             self._set_item(key, value)
   3473 
   3474     def _setitem_slice(self, key, value):

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   3547 
   3548         self._ensure_valid_index(value)
-> 3549         value = self._sanitize_column(key, value)
   3550         NDFrame._set_item(self, key, value)
   3551 

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
   3732 
   3733             # turn me into an ndarray
-> 3734             value = sanitize_index(value, self.index, copy=False)
   3735             if not isinstance(value, (np.ndarray, Index)):
   3736                 if isinstance(value, list) and len(value) > 0:

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_index(data, index, copy)
    610 
    611     if len(data) != len(index):
--> 612         raise ValueError("Length of values does not match length of index")
    613 
    614     if isinstance(data, ABCIndexClass) and not copy:

ValueError: Length of values does not match length of index
In [112]:
with_pred_label = unlabeled_df.copy()
with_pred_label['actual'] = [0 if x == 0 else 'tbd' for x in df0['lda_predict']]
with_pred_label
Out[112]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment lda_score lda_predict actual
128037 128037 128038 6887 as the main character suggests , ` what if 3 0.625832 1 tbd
5192 5192 5193 206 well-wrought story 4 0.870760 1 tbd
50057 50057 50058 2457 pack raw dough 2 0.858039 1 tbd
109259 109259 109260 5785 into the editing room 2 0.769221 1 tbd
73349 73349 73350 3748 concerned with morality 2 0.863717 1 tbd
... ... ... ... ... ... ... ... ...
25979 25979 25980 1189 Spy 2 0.882761 1 tbd
28724 28724 28725 1331 semi-autobiographical film 2 0.873131 1 tbd
5064 5064 5065 198 that writer and director Burr Steers knows the... 3 0.803085 1 tbd
85856 85856 85857 4443 associations you choose to make 2 0.738841 1 tbd
141693 141693 141694 7686 a human volcano or 2 0.762257 1 tbd

15606 rows × 8 columns

In [117]:
len(needed_vecs[3])
Out[117]:
450
In [ ]: