from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler

import pandas as pd
og_df = pd.read_csv('kaggle_csv.csv')
df_sm = og_df.copy()
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)
len(df_sm)
# len(og_df)

df = df_sm.copy()
df['Actual'] = 'tbd'
df

# INPUT OG DF
# OUTPUT 0,1 df and 3,4 df
# def get_small_df():

# STEP 1: GET CENTEROIDS (needed_vecs) FROM LABELED
# STEP 2: APPLY CENTEROIDS (needed_vecs) TO UNLABELED

def get_lda_submission(df, negativeness_score):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df

needed_vecs = []
def get_dividing_vec(df, PoN):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score

def get_negatives(df):
    df['PoN'] = [0 if x < 2 else 1 for x in df['Sentiment']]
    # SPLIT ONE (into NEG)
    dividing_vec_1 = get_dividing_vec(df, 'PoN')
    needed_vecs.append(dividing_vec_1)
    df_01 = get_lda_submission(df, dividing_vec_1)
    sm_df = df_01[df_01['lda_predict'] == 0].copy()
    
    # SPLIT TWO (into 0 and 1)
    sm_df['PoN2'] = [0 if x < 1 else 1 for x in sm_df['Sentiment']]
    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')
    needed_vecs.append(dividing_vec_2)
    df_02 = get_lda_submission(sm_df, dividing_vec_2)
    
    
#      = get_dividing_vec(df_01, 'PoN2')
    
#     print(needed_vecs)
    # split big df 
    # take 0 
    # split again
    # PRINT!
#     print(df)
    return sm_df, needed_vecs

def get_positives(df):
    df['PoN'] = [0 if x < 3 else 1 for x in df['Sentiment']]
    # SPLIT ONE (into NEG)
    dividing_vec_1 = get_dividing_vec(df, 'PoN')
    needed_vecs.append(dividing_vec_1)
    df_01 = get_lda_submission(df, dividing_vec_1)
    sm_df = df_01[df_01['lda_predict'] == 1].copy()
    
    # SPLIT TWO (into 0 and 1)
    sm_df['PoN2'] = [1 if x == 4 else 0 for x in sm_df['Sentiment']]
    dividing_vec_2 = get_dividing_vec(sm_df, 'PoN2')
    needed_vecs.append(dividing_vec_2)
    df_02 = get_lda_submission(sm_df, dividing_vec_2)
    return sm_df, needed_vecs

 

def get_0_1(unlabeled_df, needed_vecs):
    df_01 = get_lda_submission(unlabeled_df, needed_vecs[0])
    sm_df = df_01[df_01['lda_predict'] == 0].copy()
    df_01_b = get_lda_submission(sm_df, needed_vecs[1])
    return df_01_b

def get_3_4(unlabeled_df, needed_vecs):
    df_01 = get_lda_submission(unlabeled_df, needed_vecs[2])
    sm_df = df_01[df_01['lda_predict'] == 1].copy()
    df_01_b = get_lda_submission(sm_df, needed_vecs[3])   
    return df_01_b

with_negs, needed_vecs = get_negatives(df)
with_pos, needed_vecs = get_positives(df)

unlabeled_df = df_sm.copy()
test = get_0_1(unlabeled_df, needed_vecs)

len(test)

1202

unlabeled_df = df_sm.copy()
test = get_3_4(unlabeled_df, needed_vecs)
test

from collections import Counter
Counter(with_negs['lda_predict'].values)

Counter({0: 1202})

Counter(with_negs['Sentiment'].values)

Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})

len(with_negs)

1202

test=pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')
test.to_csv('kaggle_csv_test.csv')

df0 = get_lda_submission(unlabeled_df, needed_vecs[0])
df1 = get_lda_submission(unlabeled_df, needed_vecs[1])
df1

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-113-bdea7e048706> in <module>
      1 df0 = get_lda_submission(unlabeled_df, needed_vecs[0])
----> 2 df1 = get_lda_submission(unlabeled_df, needed_vecs[1])
      3 df1

<ipython-input-89-d4dc86660ab9> in get_lda_submission(df, negativeness_score)
      8 def get_lda_submission(df, negativeness_score):
      9     tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
---> 10     df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
     11     df['lda_predict'] = (df.lda_score > .5).astype(int)
     12     return df

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
   3470         else:
   3471             # set column
-> 3472             self._set_item(key, value)
   3473 
   3474     def _setitem_slice(self, key, value):

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _set_item(self, key, value)
   3547 
   3548         self._ensure_valid_index(value)
-> 3549         value = self._sanitize_column(key, value)
   3550         NDFrame._set_item(self, key, value)
   3551 

~/anaconda3/lib/python3.7/site-packages/pandas/core/frame.py in _sanitize_column(self, key, value, broadcast)
   3732 
   3733             # turn me into an ndarray
-> 3734             value = sanitize_index(value, self.index, copy=False)
   3735             if not isinstance(value, (np.ndarray, Index)):
   3736                 if isinstance(value, list) and len(value) > 0:

~/anaconda3/lib/python3.7/site-packages/pandas/core/internals/construction.py in sanitize_index(data, index, copy)
    610 
    611     if len(data) != len(index):
--> 612         raise ValueError("Length of values does not match length of index")
    613 
    614     if isinstance(data, ABCIndexClass) and not copy:

ValueError: Length of values does not match length of index

with_pred_label = unlabeled_df.copy()
with_pred_label['actual'] = [0 if x == 0 else 'tbd' for x in df0['lda_predict']]
with_pred_label

len(needed_vecs[3])

450

	Unnamed: 0	PhraseId	SentenceId	Phrase	Sentiment	Actual
128037	128037	128038	6887	as the main character suggests , ` what if	3	tbd
5192	5192	5193	206	well-wrought story	4	tbd
50057	50057	50058	2457	pack raw dough	2	tbd
109259	109259	109260	5785	into the editing room	2	tbd
73349	73349	73350	3748	concerned with morality	2	tbd
...	...	...	...	...	...	...
25979	25979	25980	1189	Spy	2	tbd
28724	28724	28725	1331	semi-autobiographical film	2	tbd
5064	5064	5065	198	that writer and director Burr Steers knows the...	3	tbd
85856	85856	85857	4443	associations you choose to make	2	tbd
141693	141693	141694	7686	a human volcano or	2	tbd

	Unnamed: 0	PhraseId	SentenceId	Phrase	Sentiment	lda_score	lda_predict
127609	127609	127610	6864	one that is dark , disturbing , painful to wat...	3	0.529864	1
112393	112393	112394	5969	a satisfying summer blockbuster and worth a look	3	0.741489	1
81788	81788	81789	4220	And how .	2	0.554698	1
38122	38122	38123	1813	reminds you of why animation is such a perfect...	4	0.586279	1
38553	38553	38554	1838	is to catch the pitch of his poetics , savor t...	3	0.472868	0
...	...	...	...	...	...	...	...
124761	124761	124762	6705	Both Garcia and Jagger turn in perfectly execu...	3	0.743028	1
138913	138913	138914	7529	has all the enjoyable randomness of a very liv...	4	0.691226	1
146161	146161	146162	7948	great scares and a good surprise ending	3	0.795357	1
35603	35603	35604	1678	to the core of what it actually means to face ...	3	0.579167	1
116843	116843	116844	6234	It is life affirming and heartbreaking , sweet...	3	0.647515	1

	Unnamed: 0	PhraseId	SentenceId	Phrase	Sentiment	lda_score	lda_predict	actual
128037	128037	128038	6887	as the main character suggests , ` what if	3	0.625832	1	tbd
5192	5192	5193	206	well-wrought story	4	0.870760	1	tbd
50057	50057	50058	2457	pack raw dough	2	0.858039	1	tbd
109259	109259	109260	5785	into the editing room	2	0.769221	1	tbd
73349	73349	73350	3748	concerned with morality	2	0.863717	1	tbd
...	...	...	...	...	...	...	...	...
25979	25979	25980	1189	Spy	2	0.882761	1	tbd
28724	28724	28725	1331	semi-autobiographical film	2	0.873131	1	tbd
5064	5064	5065	198	that writer and director Burr Steers knows the...	3	0.803085	1	tbd
85856	85856	85857	4443	associations you choose to make	2	0.738841	1	tbd
141693	141693	141694	7686	a human volcano or	2	0.762257	1	tbd