In [ ]:
 
In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler
In [20]:
import pandas as pd
og_df = pd.read_csv('kaggle_csv.csv')
df_sm = og_df.copy()
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)
df = df_sm.copy()
len(df_sm)
Out[20]:
15606
In [16]:
def get_dividing_vec(df, num):
    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
In [17]:
negativeness_score_01 = get_dividing_vec(df, 2) # splits into 01, 234
negativeness_score_34 = get_dividing_vec(df, 3) # splits into 012, 34
In [22]:
# def get_lda_score(negativeness_score):
#     return MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
# df['lda_score'] = df.apply(lambda x: get_lda_score(x['Phrase']), axis=1)
# df['lda_score'] = df.apply(lambda x: get_lda_score(negativeness_score), axis=1)
# df['lda_predict'] = (df.lda_score > .5).astype(int)
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))
df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)

df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1,1))
df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)
In [23]:
def calculate_actual(row):
    if (row['lda_predict_01'] == False) and (row['lda_predict_34'] == False):
        return 0
    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == False):
        return 1
    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == False):
        return 3
df['actual'] = df.apply
Out[23]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment lda_score_01 lda_predict_01 lda_score_34 lda_predict_34
128037 128037 128038 6887 as the main character suggests , ` what if 3 0.625832 True 0.261945 False
5192 5192 5193 206 well-wrought story 4 0.870760 True 0.209028 False
50057 50057 50058 2457 pack raw dough 2 0.858039 True 0.173733 False
109259 109259 109260 5785 into the editing room 2 0.769221 True 0.192198 False
73349 73349 73350 3748 concerned with morality 2 0.863717 True 0.231941 False
... ... ... ... ... ... ... ... ... ...
25979 25979 25980 1189 Spy 2 0.882761 True 0.166748 False
28724 28724 28725 1331 semi-autobiographical film 2 0.873131 True 0.233286 False
5064 5064 5065 198 that writer and director Burr Steers knows the... 3 0.803085 True 0.332284 False
85856 85856 85857 4443 associations you choose to make 2 0.738841 True 0.194295 False
141693 141693 141694 7686 a human volcano or 2 0.762257 True 0.265051 False

15606 rows × 9 columns

In [45]:
def get_dividing_vec_og(df, num):
    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score

def get_dividing_vec(df, num):
    df['PoN'] = [0 if x == num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
In [46]:
negativeness_score_0 = get_dividing_vec(df, 0) # splits into 0, 1234
negativeness_score_1 = get_dividing_vec(df, 1) # splits into 1, 0234
negativeness_score_2 = get_dividing_vec(df, 2) # splits into 2, 0134
negativeness_score_3 = get_dividing_vec(df, 3) # splits into 3, 0124
negativeness_score_4 = get_dividing_vec(df, 4) # splits into 4, 0123
In [47]:
df['lda_score_0'] = MinMaxScaler().fit_transform(negativeness_score_0.reshape(-1,1))
df['lda_predict_0'] = df.apply(lambda x: (x['lda_score_0'] >.5), axis=1)

df['lda_score_1'] = MinMaxScaler().fit_transform(negativeness_score_1.reshape(-1,1))
df['lda_predict_1'] = df.apply(lambda x: (x['lda_score_1'] >.5), axis=1)

df['lda_score_2'] = MinMaxScaler().fit_transform(negativeness_score_2.reshape(-1,1))
df['lda_predict_2'] = df.apply(lambda x: (x['lda_score_2'] >.5), axis=1)

df['lda_score_3'] = MinMaxScaler().fit_transform(negativeness_score_3.reshape(-1,1))
df['lda_predict_3'] = df.apply(lambda x: (x['lda_score_3'] >.5), axis=1)

df['lda_score_4'] = MinMaxScaler().fit_transform(negativeness_score_4.reshape(-1,1))
df['lda_predict_4'] = df.apply(lambda x: (x['lda_score_4'] >.5), axis=1)
In [48]:
negativeness_score_01 = get_dividing_vec_og(df, 2) # splits into 4, 0123
negativeness_score_34 = get_dividing_vec_og(df, 3) # splits into 4, 0123
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))
df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)

df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))
df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)
In [65]:
from collections import Counter
neg_df = df[df['lda_predict_01'] == False]
# print(len(neg_df))
# Counter(neg_df['Sentiment'])
neg_df
negativeness_score_01_0 = get_dividing_vec_og(neg_df.copy(), 1) # splits into 4, 0123
neg_df['lda_score_01_0'] = MinMaxScaler().fit_transform(negativeness_score_01_0.reshape(-1,1))
neg_df['lda_predict_01_0'] = neg_df.apply(lambda x: (x['lda_score_01_0'] >.5), axis=1)
# neg_df
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:7: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/ipykernel_launcher.py:8: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
In [70]:
columns = ['Sentiment','lda_predict_01_0']
view_df = pd.DataFrame(neg_df, columns = columns)
view_df
Counter(neg_df['lda_predict_01_0'])
Counter(neg_df['Sentiment'])
Out[70]:
Counter({2: 305, 0: 179, 3: 208, 1: 430, 4: 80})
In [50]:
columns = ['Sentiment','lda_predict_01','lda_predict_34','lda_predict_0', 'lda_predict_1','lda_predict_2','lda_predict_3','lda_predict_4']
view_df = pd.DataFrame(df, columns = columns)
In [51]:
view_df[view_df['Sentiment'] == 4]
Out[51]:
Sentiment lda_predict_01 lda_predict_34 lda_predict_0 lda_predict_1 lda_predict_2 lda_predict_3 lda_predict_4
5192 4 True True True True False True True
99335 4 True True True True False True True
144855 4 True True True True False True True
82075 4 False False False False False True True
82568 4 True True True True False True True
... ... ... ... ... ... ... ... ...
86351 4 True True True True False False True
28644 4 True True True True False True True
28410 4 True True True True False True True
20683 4 True True True True False True True
146190 4 True True True True False True True

985 rows × 8 columns

In [52]:
view_df[view_df['Sentiment'] == 3]
Out[52]:
Sentiment lda_predict_01 lda_predict_34 lda_predict_0 lda_predict_1 lda_predict_2 lda_predict_3 lda_predict_4
128037 3 True True True True False True True
21758 3 True True True True False True True
51668 3 True True True True False True True
19946 3 True True True True False True True
103068 3 True True True True False True True
... ... ... ... ... ... ... ... ...
115347 3 True True True True False False True
139003 3 True True True True False True True
116843 3 False False False True True False False
87571 3 True True True True False True True
5064 3 True True True True False True True

3239 rows × 8 columns

In [71]:
df
Out[71]:
Unnamed: 0 PhraseId SentenceId Phrase Sentiment lda_score_01 lda_predict_01 lda_score_34 lda_predict_34 PoN lda_score_0 lda_predict_0 lda_score_1 lda_predict_1 lda_score_2 lda_predict_2 lda_score_3 lda_predict_3 lda_score_4 lda_predict_4
128037 128037 128038 6887 as the main character suggests , ` what if 3 0.625832 True 0.625832 True 1 0.682450 True 0.588361 True 0.230660 False 0.658924 True 0.825389 True
5192 5192 5193 206 well-wrought story 4 0.870760 True 0.870760 True 1 0.950018 True 0.807329 True 0.074725 False 0.743194 True 0.833822 True
50057 50057 50058 2457 pack raw dough 2 0.858039 True 0.858039 True 0 0.859539 True 0.848161 True 0.052835 False 0.760960 True 0.892131 True
109259 109259 109260 5785 into the editing room 2 0.769221 True 0.769221 True 0 0.865337 True 0.698661 True 0.108810 False 0.738192 True 0.880093 True
73349 73349 73350 3748 concerned with morality 2 0.863717 True 0.863717 True 0 0.925024 True 0.812825 True 0.096071 False 0.697099 True 0.843338 True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
25979 25979 25980 1189 Spy 2 0.882761 True 0.882761 True 0 0.925578 True 0.843655 True 0.035800 False 0.783875 True 0.877085 True
28724 28724 28725 1331 semi-autobiographical film 2 0.873131 True 0.873131 True 0 0.898261 True 0.846496 True 0.092741 False 0.725815 True 0.800833 True
5064 5064 5065 198 that writer and director Burr Steers knows the... 3 0.803085 True 0.803085 True 1 0.774411 True 0.816138 True 0.203446 False 0.607434 True 0.731231 True
85856 85856 85857 4443 associations you choose to make 2 0.738841 True 0.738841 True 0 0.797750 True 0.694952 True 0.124631 False 0.731624 True 0.884184 True
141693 141693 141694 7686 a human volcano or 2 0.762257 True 0.762257 True 0 0.790570 True 0.738217 True 0.169487 False 0.672788 True 0.799116 True

15606 rows × 20 columns

In [74]:
columns = ['Sentiment','lda_score_0', 'lda_score_1','lda_score_2','lda_score_3','lda_score_4']
view_df = pd.DataFrame(df, columns = columns)
In [75]:
view_df['avg_0'] = 
Out[75]:
Sentiment lda_score_0 lda_score_1 lda_score_2 lda_score_3 lda_score_4
128037 3 0.682450 0.588361 0.230660 0.658924 0.825389
5192 4 0.950018 0.807329 0.074725 0.743194 0.833822
50057 2 0.859539 0.848161 0.052835 0.760960 0.892131
109259 2 0.865337 0.698661 0.108810 0.738192 0.880093
73349 2 0.925024 0.812825 0.096071 0.697099 0.843338
... ... ... ... ... ... ...
25979 2 0.925578 0.843655 0.035800 0.783875 0.877085
28724 2 0.898261 0.846496 0.092741 0.725815 0.800833
5064 3 0.774411 0.816138 0.203446 0.607434 0.731231
85856 2 0.797750 0.694952 0.124631 0.731624 0.884184
141693 2 0.790570 0.738217 0.169487 0.672788 0.799116

15606 rows × 6 columns

In [82]:
print(view_df['lda_score_0'].sum()/len(view_df))
print(view_df['lda_score_1'].sum()/len(view_df))
print(view_df['lda_score_2'].sum()/len(view_df))
print(view_df['lda_score_3'].sum()/len(view_df))
print(view_df['lda_score_4'].sum()/len(view_df))
0.769356026955543
0.701779683155207
0.18353138059679489
0.6827725447936251
0.786741026622421
In [83]:
view_df[view_df['Sentiment'] == 0 ]
Out[83]:
Sentiment lda_score_0 lda_score_1 lda_score_2 lda_score_3 lda_score_4
33878 0 0.464875 0.418104 0.410717 0.582924 0.663965
76568 0 0.514842 0.392097 0.398716 0.586173 0.688356
95529 0 0.657040 0.685026 0.281238 0.607644 0.678328
1046 0 0.851553 0.777606 0.065881 0.776598 0.896196
101732 0 0.618032 0.793915 0.086736 0.810066 0.909167
... ... ... ... ... ... ...
106560 0 0.744318 0.622385 0.192315 0.694039 0.826760
33469 0 0.460790 0.549610 0.378017 0.611169 0.613575
149358 0 0.653788 0.584912 0.265159 0.658602 0.742774
25024 0 0.610724 0.636797 0.272931 0.638486 0.728207
76265 0 0.511089 0.548240 0.278886 0.701541 0.756394

691 rows × 6 columns

In [85]:
# def split_into_neg_pos(df):
#     # USE CENTEROID 
In [86]:
def get_lda(df):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid
In [87]:
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')

df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)
In [92]:
def get_lda_submission(df, n_centroid, p_centroid):
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df

columns = ['Phrase']
new_df =  pd.DataFrame(df, columns = columns)
# new_df
predicted_df = get_lda_submission(new_df, n_cent, p_cent)
In [97]:
predicted_df['actual'] = df['S0']
Counter(predicted_df['lda_predict'] )
Out[97]:
Counter({1: 152309, 0: 3751})
In [101]:
predicted_df[(predicted_df.actual == 0) & (predicted_df.lda_predict == 0)].count()
Out[101]:
Phrase         670
lda_score      670
lda_predict    670
actual         670
dtype: int64
In [102]:
predicted_df[(predicted_df.actual == 1) & (predicted_df.lda_predict == 0)].count()
Out[102]:
Phrase         1218
lda_score      1218
lda_predict    1218
actual         1218
dtype: int64
In [103]:
predicted_df[(predicted_df.actual > 1) & (predicted_df.lda_predict == 0)].count()
Out[103]:
Phrase         1863
lda_score      1863
lda_predict    1863
actual         1863
dtype: int64
In [106]:
predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]
In [108]:
predicted_df['check'] = predicted_df.apply(lambda x: (x['lda_predict'] == x['bool']), axis=1)
In [109]:
predicted_df
Out[109]:
Phrase lda_score lda_predict actual bool check
0 A series of escapades demonstrating the adage ... 0.531051 1 1 0 False
1 A series of escapades demonstrating the adage ... 0.712447 1 2 1 True
2 A series 0.780069 1 2 1 True
3 A 0.577151 1 2 1 True
4 series 0.893099 1 2 1 True
... ... ... ... ... ... ...
156055 Hearst 's 0.858866 1 2 1 True
156056 forced avuncular chortles 0.891445 1 1 0 False
156057 avuncular chortles 0.905823 1 3 1 True
156058 avuncular 0.905623 1 2 1 True
156059 chortles 0.905623 1 2 1 True

156060 rows × 6 columns

In [110]:
Counter(predicted_df['check'])
Out[110]:
Counter({False: 34320, True: 121740})
In [111]:
len(df)
Out[111]:
156060
In [112]:
121740/len(df)
Out[112]:
0.780084582852749
In [ ]: