In our endless quest for better accuracy in the Kaggle competition, we're going to try applying LDA to the dataset (just for fun!).
Because this LDA approach relies on binary labels, we first separate the data into one-vs-rest splits: 0 vs. not-0, 1 vs. not-1, 2 vs. not-2, and so on.
## =======================================================
## IMPORTING
## =======================================================
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')
## =======================================================
## LDA
## =======================================================
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize.casual import casual_tokenize
tfidf_model = TfidfVectorizer(tokenizer = casual_tokenize)
from sklearn.preprocessing import MinMaxScaler
def get_lda(df):
tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
mask = df.PoN.astype(bool).values
n_centroid = tfidf_docs[mask].mean(axis=0)
p_centroid = tfidf_docs[~mask].mean(axis=0)
negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
df['lda_predict'] = (df.lda_score > .5).astype(int)
return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid
def get_centroids(df):
tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
mask = df.PoN.astype(bool).values
n_centroid = tfidf_docs[mask].mean(axis=0)
p_centroid = tfidf_docs[~mask].mean(axis=0)
negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
return n_centroid, p_centroid, negativity_score
def get_lda_submission(df, n_centroid, p_centroid):
tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
df['lda_predict'] = (df.lda_score > .5).astype(int)
return df
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)
print(lda_score, neg_score)
print(n_cent)
print(p_cent)
len(n_cent)
len(n_cent)
len(p_cent)
len(df)
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df = get_lda(df)
print(lda_score)
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df = df.drop(df[df['S0'] == 2].index)
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df = get_lda(df)
print(lda_score)
df_neg = df[df['lda_predict'] == 0].copy()
df_neg
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df = get_lda(df_neg)
print(lda_score)
df
df_pos = df[df['lda_predict'] == 1].copy()
df_pos['PoN'] = [0 if x == 3 else 1 for x in df_pos['S0']]
df_pos
lda_score, p_df = get_lda(df_pos)
print(lda_score)
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
neg_n_centroid, neg_p_centroid = get_centroids(df)
score, df = get_lda(df)
print(score)
submission_df = get_lda_submission(df, n_centroid, p_centroid)
submission_df['round_1'] = ['pos' if x < 1 else 'neg' for x in submission_df['lda_predict']]
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
df_neg['PoN'] = [0 if x == 0 else 1 for x in df_neg['S0']]
lda_score, n_df = get_lda(df_neg)
print(lda_score)
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 3 else 1 for x in df['S0']]
pos_n_centroid, pos_p_centroid = get_centroids(df)
score, df = get_lda(df)
print(score)
df_pos = df[df['lda_predict'] == 1].copy()
df_pos_true = df_pos[df_pos['PoN'] == df_pos['lda_predict']]
df_pos_true
len(df_pos_true[df_pos_true['S0'] == 3])
len(df_pos_true[df_pos_true['S0'] == 4])
len(df_pos_true)
df = pd.DataFrame()
df['Phrase'] = df_pos_true['Phrase']
df['S0'] = df_pos_true['S0']
df['PoN'] = [0 if x == 3 else 1 for x in df['S0']]
pos2_n_centroid, pos2_p_centroid = get_centroids(df)
score, df = get_lda(df)
print(score)
df = get_lda_submission(df, pos2_n_centroid, pos2_p_centroid)
print(df)
df['actual_score'] = [3 if x == 0 else 4 for x in df['lda_predict']]
df
from collections import Counter
Counter(train['Sentiment'].values)
train_test = train.drop('Sentiment',axis=1)
train_test
train
submission_df = get_lda_submission(train_test, n_centroid, p_centroid)
submission_df['COMPARE'] = train['Sentiment']
submission_df
df_neg = submission_df[submission_df['lda_predict'] == 0].copy()
submission_df_2 = get_lda_submission(df_neg, neg_n_centroid, neg_p_centroid)
# submission_df_2
df_neg
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
n_cent_1, p_cent_1, neg_score = get_centroids(df)
neg_score
train_test = train.drop('Sentiment',axis=1)
submission_df = get_lda_submission(train_test, n_cent_1, p_cent_1)
submission_df['lda_predict'].values.min()
train_test['lda_predict'].values.max()
# train_test_neg = train_test[train_test['lda_predict'] == 0].copy()
train_test