In [ ]:
 
In [8]:
from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Shared TF-IDF vectorizer; phrases are split with NLTK's casual_tokenize
# rather than scikit-learn's built-in tokenizer.
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
In [20]:
import pandas as pd

# Full dataset as read from disk; kept untouched for reference.
og_df = pd.read_csv('kaggle_csv.csv')

# Reproducible 10% sample (random_state fixed). Rows are drawn WITH
# replacement, so the same row -- and the same index label -- can appear
# more than once in the sample.
df_sm = og_df.copy().sample(random_state=1, replace=True, frac=0.10)

# Working copy: columns added to `df` later do not show up in `df_sm`.
df = df_sm.copy()
len(df_sm)
Out[20]:
15606
In [16]:
def get_dividing_vec(df, num):
    """Project every phrase onto the axis joining two sentiment-group centroids.

    Rows are split into a "lower" group (``Sentiment < num``) and an "upper"
    group (everything else). Each phrase is TF-IDF vectorized, the mean vector
    of each group is computed, and every phrase vector is dotted with
    ``upper_centroid - lower_centroid``.

    Despite the historical "negativity" wording around this function, the
    returned score grows as a phrase looks more like the *upper* group
    (``Sentiment >= num``).

    Side effects:
        * Writes/overwrites the 0/1 column ``df['PoN']`` on the frame passed in.
        * Refits the module-level ``tfidf_model`` on ``df['Phrase']``.

    Args:
        df: DataFrame with a ``Sentiment`` column comparable to ``num`` and a
            ``Phrase`` column of text documents.
        num: Threshold; rows with ``Sentiment < num`` form the lower group.

    Returns:
        1-D numpy array with one score per row of ``df``, in row order.
    """
    # 0 -> lower group (Sentiment < num), 1 -> upper group.
    df['PoN'] = [int(not (sentiment < num)) for sentiment in df['Sentiment']]

    # Dense (n_rows, n_terms) TF-IDF matrix; the vocabulary is relearned on every call.
    doc_vectors = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()

    in_upper = df['PoN'].values.astype(bool)
    upper_centroid = doc_vectors[in_upper].mean(axis=0)
    lower_centroid = doc_vectors[~in_upper].mean(axis=0)

    # Signed projection onto the lower -> upper direction.
    return doc_vectors.dot(upper_centroid - lower_centroid)
In [17]:
# Threshold 2: sentiment labels {0, 1} on one side, {2, 3, 4} on the other.
negativeness_score_01 = get_dividing_vec(df=df, num=2)
# Threshold 3: sentiment labels {0, 1, 2} on one side, {3, 4} on the other.
negativeness_score_34 = get_dividing_vec(df=df, num=3)
In [22]:
# Rescale each raw projection to MinMaxScaler's default [0, 1] range, then
# threshold at the midpoint to get one boolean prediction per split.
# get_dividing_vec projects onto (centroid of Sentiment >= num) minus
# (centroid of Sentiment < num), so True means the phrase falls on the
# higher-sentiment side of that split.
# .ravel() flattens the (n, 1) scaler output so a plain 1-D array is stored
# in the column; the comparison is vectorized instead of a row-wise apply.
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1, 1)).ravel()
df['lda_predict_01'] = df['lda_score_01'] > .5

df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1, 1)).ravel()
df['lda_predict_34'] = df['lda_score_34'] > .5
In [23]:
def calculate_actual(row):
    """Combine the two binary split predictions of a row into one label.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) exposing the truthy /
            falsy values ``row['lda_predict_01']`` and ``row['lda_predict_34']``.

    Returns:
        0 when both predictions are False,
        1 when only ``lda_predict_01`` is True,
        3 when both predictions are True,
        None for the contradictory case where ``lda_predict_34`` is True but
        ``lda_predict_01`` is False.
    """
    above_01_split = bool(row['lda_predict_01'])
    above_34_split = bool(row['lda_predict_34'])

    if not above_01_split and not above_34_split:
        return 0
    if above_01_split and not above_34_split:
        return 1
    if above_01_split and above_34_split:
        # The original branch repeated the (True, False) test here, which made
        # it unreachable and meant label 3 was never produced.
        return 3
    # (False, True): the two splits disagree, so no label is assigned.
    return None
# Label every row; axis=1 hands each row to calculate_actual as a Series.
# (Previously the bound method `df.apply` itself was assigned to the column.)
df['actual'] = df.apply(calculate_actual, axis=1)
Out[23]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment lda_score_01 lda_predict_01 lda_score_34 lda_predict_34
128037 128037 128038 6887 as the main character suggests , ` what if 3 0.625832 True 0.261945 False
5192 5192 5193 206 well-wrought story 4 0.870760 True 0.209028 False
50057 50057 50058 2457 pack raw dough 2 0.858039 True 0.173733 False
109259 109259 109260 5785 into the editing room 2 0.769221 True 0.192198 False
73349 73349 73350 3748 concerned with morality 2 0.863717 True 0.231941 False
... ... ... ... ... ... ... ... ... ...
25979 25979 25980 1189 Spy 2 0.882761 True 0.166748 False
28724 28724 28725 1331 semi-autobiographical film 2 0.873131 True 0.233286 False
5064 5064 5065 198 that writer and director Burr Steers knows the... 3 0.803085 True 0.332284 False
85856 85856 85857 4443 associations you choose to make 2 0.738841 True 0.194295 False
141693 141693 141694 7686 a human volcano or 2 0.762257 True 0.265051 False

15606 rows × 9 columns

In [ ]: