HW8 methods on HW7 assignment

MOTIVATION

In our endless quest for better accuracy on the Kaggle competition, we're going to try applying LDA (linear discriminant analysis) to the dataset (just for funsies!).

METHODS OVERVIEW

Because LDA relies on binary labels, we're first going to split the five sentiment classes into one-vs-rest problems: 0 vs. not-0, 1 vs. not-1, 2 vs. not-2, and so on.
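A quick sketch of that one-vs-rest split (illustrative only; the is_0, is_1, ... column names are hypothetical and are not used later in the notebook):

## Illustrative only: build one binary column per sentiment class
import pandas as pd

train = pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
for label in sorted(train['Sentiment'].unique()):
    # 1 where the phrase carries this sentiment label, 0 for every other label
    train[f'is_{label}'] = (train['Sentiment'] == label).astype(int)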
In [30]:
## =======================================================
## IMPORTING
## =======================================================
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')

## =======================================================
## LDA
## =======================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler

def get_lda(df):
    # TF-IDF matrix for every phrase in this frame (the vectorizer is re-fit each call)
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    # mask picks out the rows where PoN == 1
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)    # centroid of the PoN == 1 class
    p_centroid = tfidf_docs[~mask].mean(axis=0)   # centroid of the PoN == 0 class
    # project every document onto the line between the two centroids
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    # rescale to [0, 1] and threshold at 0.5 for a binary prediction
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    # accuracy against PoN, plus the annotated frame and the raw LDA pieces
    return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid

def get_centroids(df):
    # same centroid computation as get_lda, but without scoring or predicting
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return n_centroid, p_centroid, negativity_score
    
def get_lda_submission(df, n_centroid, p_centroid):
    # transform (not fit_transform) so the columns line up with the centroids fitted earlier
    tfidf_docs = tfidf_model.transform(raw_documents = df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df
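
One note on the submission helper: to score phrases that were not part of the fit, the vectorizer has to reuse the vocabulary it learned on the training phrases, otherwise the centroid vectors will not line up with the new TF-IDF columns, which is why held-out scoring should use transform rather than fit_transform. A minimal sketch of that flow, reusing the tfidf_model and train objects defined above; the pos_mask split, the held_out sample, and the variable names are purely illustrative:

## Illustrative only: score held-out phrases against centroids learned on the training set
train_tfidf = tfidf_model.fit_transform(raw_documents = train['Phrase']).toarray()  # fit once, on train
pos_mask = (train['Sentiment'] > 2).values                       # one example of a binary split
centroid_diff = train_tfidf[pos_mask].mean(axis=0) - train_tfidf[~pos_mask].mean(axis=0)

held_out = train['Phrase'].sample(5, random_state=0)              # stand-in for unseen phrases
held_out_tfidf = tfidf_model.transform(held_out).toarray()        # transform only -- columns stay aligned
scores = held_out_tfidf.dot(centroid_diff)                        # higher = closer to the positive centroid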

TEST 1: Splitting the data into NEG (0,1) and POS (2,3,4)

In [32]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)
print(lda_score, neg_score)
print(n_cent)
print(p_cent)
len(n_cent)
0.78 [-7.65301422e-03 -3.94205354e-03 -2.55865579e-03 ...  1.39818841e-05
  9.88668503e-06  9.88668503e-06]
[7.10273499e-04 7.44736360e-05 1.72212536e-04 ... 4.64599879e-05
 4.42011525e-05 0.00000000e+00]
[4.17019741e-04 1.99202480e-05 3.04331371e-04 ... 0.00000000e+00
 0.00000000e+00 1.11766109e-04]
Out[32]:
16394
In [35]:
len(n_cent)
len(p_cent)
len(df)
Out[35]:
156060

TEST 2: Splitting the data into NEG (0,1,2) and POS (3,4)

In [17]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, *_ = get_lda(df)  # get_lda returns five values; keep the first two
print(lda_score)
0.736

TEST 3: Splitting the data into NEG (0,1) and POS (3,4) (Removing Neutral)

In [18]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df = df.drop(df[df['S0'] == 2].index)
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, *_ = get_lda(df)
print(lda_score)
0.649

TEST 4A: Taking the "negatives" from Test 3 and running LDA

In [19]:
df_neg = df[df['lda_predict'] == 0].copy()
df_neg
Out[19]:
Phrase S0 PoN lda_score lda_predict
94 , I suspect , would have a hard time sitting t... 1 0 0.497936 0
100 would have a hard time sitting through this one . 1 0 0.447842 0
101 would have a hard time sitting through this one 0 0 0.460341 0
103 have a hard time sitting through this one 0 0 0.488530 0
110 sitting through this one 1 0 0.495408 0
... ... ... ... ... ...
156031 The movie 's downfall is to substitute plot fo... 1 0 0.413219 0
156032 The movie 's downfall 1 0 0.496717 0
156033 is to substitute plot for personality . 1 0 0.424279 0
156034 is to substitute plot for personality 1 0 0.437314 0
156036 substitute plot for personality 1 0 0.496285 0

13679 rows × 5 columns

In [20]:
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df, *_ = get_lda(df_neg)
print(lda_score)
0.807
In [21]:
df
Out[21]:
Phrase S0 PoN lda_score lda_predict
0 A series of escapades demonstrating the adage ... 1 0 0.628765 1
21 good for the goose 3 1 0.668112 1
22 good 3 1 0.837784 1
33 the gander , some of which occasionally amuses... 1 0 0.572662 1
46 amuses 3 1 0.562098 1
... ... ... ... ... ...
156047 quietly suggesting the sadness and obsession b... 1 0 0.586494 1
156051 sadness and obsession 1 0 0.639210 1
156052 sadness and 1 0 0.677912 1
156056 forced avuncular chortles 1 0 0.547883 1
156057 avuncular chortles 3 1 0.561126 1

76478 rows × 5 columns

TEST 4B: Taking the "positives" from Test 3 and running LDA

In [22]:
df_pos = df[df['lda_predict'] == 1].copy()
df_pos['PoN'] = [0 if x == 3 else 1 for x in df_pos['S0']]
df_pos
lda_score, p_df, *_ = get_lda(df_pos)
print(lda_score)
0.618
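Tests 3, 4A, and 4B together amount to a two-stage cascade: first separate negative from positive (with the neutral phrases dropped), then split each branch into its two remaining labels. A minimal sketch of stitching those stage predictions back into 0/1/3/4 sentiment labels, assuming the df, n_df, and p_df frames produced above; the 'combined' column name is just for illustration, and since the neutral (2) phrases were already dropped in Test 3, every remaining row falls into one branch or the other:

## Illustrative only: combine the two-stage predictions into 0/1/3/4 labels
import numpy as np

df['combined'] = np.nan                                                   # initialize, then fill from each branch
df.loc[n_df.index, 'combined'] = n_df['lda_predict'].map({0: 0, 1: 1})    # negative branch: 0 vs 1
df.loc[p_df.index, 'combined'] = p_df['lda_predict'].map({0: 3, 1: 4})    # positive branch: 3 vs 4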

TEST 5: Checking to see if I need labels, because I'm second-guessing myself here

In [23]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]

FIRST SPLIT -- trying to get the negatives

In [24]:
neg_n_centroid, neg_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
submission_df = get_lda_submission(df, n_cent, p_cent)
In [ ]:
submission_df['round_1'] = ['neg' if x < 1 else 'pos' for x in submission_df['lda_predict']]
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df, *_ = get_lda(df_neg)
print(lda_score)
In [ ]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]

pos_n_centroid, pos_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
df_pos = df[df['lda_predict'] == 1].copy()
df_pos_true = df_pos[df_pos['PoN'] == df_pos['lda_predict']]
df_pos_true
In [ ]:
len(df_pos_true[df_pos_true['S0'] == 3])
In [ ]:
len(df_pos_true[df_pos_true['S0'] == 4])
In [ ]:
len(df_pos_true)
In [ ]:
df = pd.DataFrame()
df['Phrase'] = df_pos_true['Phrase']
df['S0'] = df_pos_true['S0']
df['PoN'] = [0 if x == 3 else 1 for x in df['S0']]

pos2_n_centroid, pos2_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
df = get_lda_submission(df, pos2_n_centroid, pos2_p_centroid)
print(df)
In [ ]:
df['actual_score'] = [3 if x == 0 else 4 for x in df['lda_predict']]
df
In [ ]:
from collections import Counter
Counter(train['Sentiment'].values)
In [ ]:
train_test = train.drop('Sentiment',axis=1)
train_test
In [ ]:
train
In [ ]:
submission_df = get_lda_submission(train_test, n_cent, p_cent)
In [ ]:
submission_df['COMPARE'] = train['Sentiment']
submission_df
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
submission_df_2 = get_lda_submission(df_neg, neg_n_centroid, neg_p_centroid)
In [ ]:
# submission_df_2
df_neg
In [ ]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]

n_cent_1, p_cent_1, neg_score = get_centroids(df)
In [ ]:
neg_score
In [ ]:
train_test = train.drop('Sentiment',axis=1)
submission_df = get_lda_submission(train_test, n_cent_1, p_cent_1)
In [ ]:
submission_df['lda_predict'].values.min()
In [ ]:
train_test['lda_predict'].values.max()
In [ ]:
# train_test_neg = train_test[train_test['lda_predict'] == 0].copy()
In [ ]:
train_test