# Notebook-style driver: builds a TF-IDF + centroid-difference sentiment
# splitter over a Kaggle movie-review phrases CSV.
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
# Shared vectorizer — NOTE it is re-fit by every function below that uses it.
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
og_df = pd.read_csv('kaggle_csv.csv')
df_sm = og_df.copy()
# 10% sample; replace=True means rows can repeat — TODO confirm bootstrap is intended.
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)
len(df_sm)
# len(og_df)
df = df_sm.copy()
df['Actual'] = 'tbd'  # placeholder label column
df
# INPUT OG DF
# OUTPUT 0,1 df and 3,4 df
# def get_small_df():
# STEP 1: GET CENTEROIDS (needed_vecs) FROM LABELED
# STEP 2: APPLY CENTEROIDS (needed_vecs) TO UNLABELED
def get_lda_submission(df, negativeness_score):
    """Attach min-max-scaled scores and a 0/1 prediction to *df* in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Mutated in place: gains 'lda_score' (scores rescaled to [0, 1])
        and 'lda_predict' (1 where lda_score > 0.5, else 0).
    negativeness_score : numpy.ndarray
        One raw real-valued score per row of *df* (1-D array).

    Returns
    -------
    pandas.DataFrame
        The same *df* object that was passed in.
    """
    # The original also ran tfidf_model.fit_transform(df['Phrase']) here and
    # discarded the result — pure dead work that additionally re-fit the
    # shared vectorizer as a side effect, so it has been removed.
    df['lda_score'] = MinMaxScaler().fit_transform(negativeness_score.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df
# Module-level accumulator of centroid-difference score vectors; appended to
# by get_negatives()/get_positives() and indexed by get_0_1()/get_3_4().
needed_vecs = []
def get_dividing_vec(df, PoN):
    """Score each phrase by its position between the two class centroids.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'Phrase' text column and the 0/1 label column
        named by *PoN*.
    PoN : str
        Name of the binary label column to split on ('PoN' or 'PoN2').

    Returns
    -------
    numpy.ndarray
        Dot product of each document's TF-IDF vector with (label-1
        centroid - label-0 centroid); larger values sit on the label-1 side.
    """
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    # BUG FIX: the original read the hard-coded column df['PoN'], ignoring the
    # *PoN* argument — so calls passing 'PoN2' silently split on the wrong labels.
    mask = df[PoN].astype(bool).values
    # NOTE(review): with this masking, n_centroid is actually the label-1
    # centroid and p_centroid the label-0 one — the names look swapped, but the
    # downstream > 0.5 threshold relies on this orientation, so it is preserved.
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
def get_negatives(df):
    """Two-stage split toward the negative end of the sentiment scale.

    Stage 1 labels rows 0 (sentiment < 2) vs 1 and keeps the rows the
    scorer predicts negative; stage 2 re-labels that subset as very
    negative (0) vs mildly negative (1) and scores it again.  Both
    dividing vectors are appended to the module-level ``needed_vecs``.
    Mutates *df* in place (adds 'PoN', 'lda_score', 'lda_predict').

    Returns the predicted-negative subset and ``needed_vecs``.
    """
    # Stage 1: binary label — 0 for sentiment 0/1, 1 for everything else.
    df['PoN'] = [1 if s >= 2 else 0 for s in df['Sentiment']]
    first_vec = get_dividing_vec(df, 'PoN')
    needed_vecs.append(first_vec)
    scored = get_lda_submission(df, first_vec)
    negatives = scored[scored['lda_predict'] == 0].copy()
    # Stage 2: within the predicted negatives, 0 for sentiment 0, 1 otherwise.
    negatives['PoN2'] = [1 if s >= 1 else 0 for s in negatives['Sentiment']]
    second_vec = get_dividing_vec(negatives, 'PoN2')
    needed_vecs.append(second_vec)
    # Called for its in-place side effect: refreshes lda_score / lda_predict
    # on the subset using the second dividing vector.
    get_lda_submission(negatives, second_vec)
    return negatives, needed_vecs
def get_positives(df):
    """Two-stage split toward the positive end of the sentiment scale.

    Stage 1 labels rows 1 (sentiment >= 3) vs 0 and keeps the rows the
    scorer predicts positive; stage 2 re-labels that subset as top-class
    (sentiment == 4 -> 1) vs not (0) and scores it again.  Both dividing
    vectors are appended to the module-level ``needed_vecs``.
    Mutates *df* in place (adds 'PoN', 'lda_score', 'lda_predict').

    Returns the predicted-positive subset and ``needed_vecs``.
    """
    # Stage 1: binary label — 1 for sentiment 3/4, 0 for everything else.
    df['PoN'] = [1 if s >= 3 else 0 for s in df['Sentiment']]
    first_vec = get_dividing_vec(df, 'PoN')
    needed_vecs.append(first_vec)
    scored = get_lda_submission(df, first_vec)
    positives = scored[scored['lda_predict'] == 1].copy()
    # Stage 2: within the predicted positives, only the top class counts as 1.
    positives['PoN2'] = [int(s == 4) for s in positives['Sentiment']]
    second_vec = get_dividing_vec(positives, 'PoN2')
    needed_vecs.append(second_vec)
    # Called for its in-place side effect on the subset's score columns.
    get_lda_submission(positives, second_vec)
    return positives, needed_vecs
def get_0_1(unlabeled_df, needed_vecs):
    """Re-apply the first two stored dividing vectors to *unlabeled_df*.

    Scores the frame with needed_vecs[0], keeps the rows predicted on the
    0 (negative) side, then re-scores that subset with needed_vecs[1].
    Returns the twice-scored subset; mutates *unlabeled_df* in place.
    """
    first_pass = get_lda_submission(unlabeled_df, needed_vecs[0])
    negative_side = first_pass[first_pass['lda_predict'] == 0].copy()
    return get_lda_submission(negative_side, needed_vecs[1])
def get_3_4(unlabeled_df, needed_vecs):
    """Re-apply the third and fourth stored dividing vectors to *unlabeled_df*.

    Scores the frame with needed_vecs[2], keeps the rows predicted on the
    1 (positive) side, then re-scores that subset with needed_vecs[3].
    Returns the twice-scored subset; mutates *unlabeled_df* in place.
    """
    first_pass = get_lda_submission(unlabeled_df, needed_vecs[2])
    positive_side = first_pass[first_pass['lda_predict'] == 1].copy()
    return get_lda_submission(positive_side, needed_vecs[3])
# Fit both two-stage splitters on the labelled sample; each call appends two
# dividing vectors to the shared needed_vecs list (order: neg1, neg2, pos1, pos2).
with_negs, needed_vecs = get_negatives(df)
with_pos, needed_vecs = get_positives(df)
# Apply the stored vectors to a fresh copy treated as unlabeled data.
unlabeled_df = df_sm.copy()
test = get_0_1(unlabeled_df, needed_vecs)
len(test)
unlabeled_df = df_sm.copy()
test = get_3_4(unlabeled_df, needed_vecs)
test
from collections import Counter
# Compare predicted vs. true label distributions on the negative branch.
Counter(with_negs['lda_predict'].values)
Counter(with_negs['Sentiment'].values)
len(with_negs)
# Convert the Kaggle test TSV to CSV (NOTE: rebinds `test` from above).
test=pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')
test.to_csv('kaggle_csv_test.csv')
# Score the unlabeled frame with the first two stored vectors individually.
# NOTE(review): each needed_vecs entry has the length of the frame it was
# fit on — reusing it here assumes unlabeled_df has the same row count; verify.
df0 = get_lda_submission(unlabeled_df, needed_vecs[0])
df1 = get_lda_submission(unlabeled_df, needed_vecs[1])
df1
with_pred_label = unlabeled_df.copy()
# Rows predicted 0 by the first split get label 0; everything else stays 'tbd'.
with_pred_label['actual'] = [0 if x == 0 else 'tbd' for x in df0['lda_predict']]
with_pred_label
len(needed_vecs[3])