HW8 methods on HW7 assignment

MOTIVATION

In our endless quest for better accuracy on the Kaggle competition, we're going to try applying LDA (linear discriminant analysis) to the dataset (just for funsies!).

METHODS OVERVIEW

Because LDA relies on binary labels, we're first going to split the five sentiment classes into one-vs-rest problems: 0 vs. not-0, 1 vs. not-1, 2 vs. not-2, and so on.
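A quick sketch of that one-vs-rest split (illustrative only; the is_0, is_1, ... column names are hypothetical and are not used later in the notebook):

## Illustrative only: build one binary column per sentiment class
import pandas as pd

train = pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
for label in sorted(train['Sentiment'].unique()):
    # 1 where the phrase carries this sentiment label, 0 for every other label
    train[f'is_{label}'] = (train['Sentiment'] == label).astype(int)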
In [30]:
## =======================================================
## IMPORTING
## =======================================================
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')

## =======================================================
## LDA
## =======================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler

def get_lda(df):
    # TF-IDF matrix for every phrase in this frame (the vectorizer is re-fit each call)
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    # mask picks out the rows where PoN == 1
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)    # centroid of the PoN == 1 class
    p_centroid = tfidf_docs[~mask].mean(axis=0)   # centroid of the PoN == 0 class
    # project every document onto the line between the two centroids
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    # rescale to [0, 1] and threshold at 0.5 for a binary prediction
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    # accuracy against PoN, plus the annotated frame and the raw LDA pieces
    return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid

def get_centroids(df):
    # same centroid computation as get_lda, but without scoring or predicting
    tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return n_centroid, p_centroid, negativity_score
    
def get_lda_submission(df, n_centroid, p_centroid):
    # transform (not fit_transform) so the columns line up with the centroids fitted earlier
    tfidf_docs = tfidf_model.transform(raw_documents = df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df
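
One note on the submission helper: to score phrases that were not part of the fit, the vectorizer has to reuse the vocabulary it learned on the training phrases, otherwise the centroid vectors will not line up with the new TF-IDF columns, which is why held-out scoring should use transform rather than fit_transform. A minimal sketch of that flow, reusing the tfidf_model and train objects defined above; the pos_mask split, the held_out sample, and the variable names are purely illustrative:

## Illustrative only: score held-out phrases against centroids learned on the training set
train_tfidf = tfidf_model.fit_transform(raw_documents = train['Phrase']).toarray()  # fit once, on train
pos_mask = (train['Sentiment'] > 2).values                       # one example of a binary split
centroid_diff = train_tfidf[pos_mask].mean(axis=0) - train_tfidf[~pos_mask].mean(axis=0)

held_out = train['Phrase'].sample(5, random_state=0)              # stand-in for unseen phrases
held_out_tfidf = tfidf_model.transform(held_out).toarray()        # transform only -- columns stay aligned
scores = held_out_tfidf.dot(centroid_diff)                        # higher = closer to the positive centroid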

TEST 1: Splitting the data into NEG (0,1) and POS (2,3,4)

In [32]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)
print(lda_score, neg_score)
print(n_cent)
print(p_cent)
len(n_cent)
0.78 [-7.65301422e-03 -3.94205354e-03 -2.55865579e-03 ...  1.39818841e-05
  9.88668503e-06  9.88668503e-06]
[7.10273499e-04 7.44736360e-05 1.72212536e-04 ... 4.64599879e-05
 4.42011525e-05 0.00000000e+00]
[4.17019741e-04 1.99202480e-05 3.04331371e-04 ... 0.00000000e+00
 0.00000000e+00 1.11766109e-04]
Out[32]:
16394
In [35]:
len(n_cent)
len(p_cent)
len(df)
Out[35]:
156060

TEST 2: Splitting the data into NEG (0,1,2) and POS (3,4)

In [17]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, *_ = get_lda(df)  # get_lda returns five values; keep the first two
print(lda_score)
0.736

TEST 3: Splitting the data into NEG (0,1) and POS (3,4) (Removing Neutral)

In [18]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

df = df.drop(df[df['S0'] == 2].index)
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, *_ = get_lda(df)
print(lda_score)
0.649

TEST 4A: Taking the "negatives" from Test 3 and running LDA

In [19]:
df_neg = df[df['lda_predict'] == 0].copy()
df_neg
Out[19]:
Phrase S0 PoN lda_score lda_predict
94 , I suspect , would have a hard time sitting t... 1 0 0.497936 0
100 would have a hard time sitting through this one . 1 0 0.447842 0
101 would have a hard time sitting through this one 0 0 0.460341 0
103 have a hard time sitting through this one 0 0 0.488530 0
110 sitting through this one 1 0 0.495408 0
... ... ... ... ... ...
156031 The movie 's downfall is to substitute plot fo... 1 0 0.413219 0
156032 The movie 's downfall 1 0 0.496717 0
156033 is to substitute plot for personality . 1 0 0.424279 0
156034 is to substitute plot for personality 1 0 0.437314 0
156036 substitute plot for personality 1 0 0.496285 0

13679 rows × 5 columns

In [20]:
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df, *_ = get_lda(df_neg)
print(lda_score)
0.807
In [21]:
df
Out[21]:
Phrase S0 PoN lda_score lda_predict
0 A series of escapades demonstrating the adage ... 1 0 0.628765 1
21 good for the goose 3 1 0.668112 1
22 good 3 1 0.837784 1
33 the gander , some of which occasionally amuses... 1 0 0.572662 1
46 amuses 3 1 0.562098 1
... ... ... ... ... ...
156047 quietly suggesting the sadness and obsession b... 1 0 0.586494 1
156051 sadness and obsession 1 0 0.639210 1
156052 sadness and 1 0 0.677912 1
156056 forced avuncular chortles 1 0 0.547883 1
156057 avuncular chortles 3 1 0.561126 1

76478 rows × 5 columns

TEST 4B: Taking the "positives" from Test 3 and running LDA

In [22]:
df_pos = df[df['lda_predict'] == 1].copy()
df_pos['PoN'] = [0 if x == 3 else 1 for x in df_pos['S0']]
df_pos
lda_score, p_df, *_ = get_lda(df_pos)
print(lda_score)
0.618
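Tests 3, 4A, and 4B together amount to a two-stage cascade: first separate negative from positive (with the neutral phrases dropped), then split each branch into its two remaining labels. A minimal sketch of stitching those stage predictions back into 0/1/3/4 sentiment labels, assuming the df, n_df, and p_df frames produced above; the 'combined' column name is just for illustration, and since the neutral (2) phrases were already dropped in Test 3, every remaining row falls into one branch or the other:

## Illustrative only: combine the two-stage predictions into 0/1/3/4 labels
import numpy as np

df['combined'] = np.nan                                                   # initialize, then fill from each branch
df.loc[n_df.index, 'combined'] = n_df['lda_predict'].map({0: 0, 1: 1})    # negative branch: 0 vs 1
df.loc[p_df.index, 'combined'] = p_df['lda_predict'].map({0: 3, 1: 4})    # positive branch: 3 vs 4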

TEST 5: Checking to see if I need labels, because I'm second-guessing myself here

In [23]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]

FIRST SPLIT -- trying to get the negatives

In [24]:
neg_n_centroid, neg_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
submission_df = get_lda_submission(df, n_cent, p_cent)
In [ ]:
submission_df['round_1'] = ['neg' if x < 1 else 'pos' for x in submission_df['lda_predict']]
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df, *_ = get_lda(df_neg)
print(lda_score)
In [ ]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]

pos_n_centroid, pos_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
df_pos = df[df['lda_predict'] == 1].copy()
df_pos_true = df_pos[df_pos['PoN'] == df_pos['lda_predict']]
df_pos_true
In [ ]:
len(df_pos_true[df_pos_true['S0'] == 3])
In [ ]:
len(df_pos_true[df_pos_true['S0'] == 4])
In [ ]:
len(df_pos_true)
In [ ]:
df = pd.DataFrame()
df['Phrase'] = df_pos_true['Phrase']
df['S0'] = df_pos_true['S0']
df['PoN'] = [0 if x == 3 else 1 for x in df['S0']]

pos2_n_centroid, pos2_p_centroid, _ = get_centroids(df)
score, df, *_ = get_lda(df)
print(score)
In [ ]:
df = get_lda_submission(df, pos2_n_centroid, pos2_p_centroid)
print(df)
In [ ]:
df['actual_score'] = [3 if x == 0 else 4 for x in df['lda_predict']]
df
In [ ]:
from collections import Counter
Counter(train['Sentiment'].values)
In [ ]:
train_test = train.drop('Sentiment',axis=1)
train_test
In [ ]:
train
In [ ]:
submission_df = get_lda_submission(train_test, n_cent, p_cent)
In [ ]:
submission_df['COMPARE'] = train['Sentiment']
submission_df
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
submission_df_2 = get_lda_submission(df_neg, neg_n_centroid, neg_p_centroid)
In [ ]:
# submission_df_2
df_neg
In [ ]:
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]

n_cent_1, p_cent_1, neg_score = get_centroids(df)
In [ ]:
neg_score
In [ ]:
train_test = train.drop('Sentiment',axis=1)
submission_df = get_lda_submission(train_test, n_cent_1, p_cent_1)
In [ ]:
submission_df['lda_predict'].values.min()
In [ ]:
train_test['lda_predict'].values.max()
In [ ]:
# train_test_neg = train_test[train_test['lda_predict'] == 0].copy()
In [ ]:
train_test