from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize.casual import casual_tokenize
import pandas as pd

tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
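# Quick illustrative check (added, not in the original notebook): casual_tokenize is
# NLTK's tweet-aware tokenizer, which keeps emoticons and contractions intact, which
# is why it is passed to TfidfVectorizer in place of the default token pattern.
print(casual_tokenize("This movie isn't great :("))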
og_df = pd.read_csv('kaggle_csv.csv')
df_sm = og_df.copy()
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)
df = df_sm.copy()
len(df_sm)
def get_dividing_vec(df, num):
    # Binarise Sentiment at `num`: PoN == 0 for labels below num, 1 otherwise.
    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    # Centroids of the two halves of the split (mask == True is the PoN == 1 side).
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    # Project every document onto the line joining the centroids. Note: despite the
    # name, higher values fall on the PoN == 1 (higher-sentiment) side of the split.
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
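# Toy sketch (added for illustration, not part of the original analysis): the
# centroid-difference trick scores each document by projecting its TF-IDF vector
# onto the vector joining the two class centroids, so documents closer to the
# mask == True centroid get larger scores.
import numpy as np
toy = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])  # 3 docs, 2 terms
toy_mask = np.array([True, True, False])               # pretend PoN == 1 labels
c1 = toy[toy_mask].mean(axis=0)
c0 = toy[~toy_mask].mean(axis=0)
print(toy.dot(c1 - c0))                                # first two docs score highest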
negativeness_score_01 = get_dividing_vec(df, 2) # splits into 01, 234
negativeness_score_34 = get_dividing_vec(df, 3) # splits into 012, 34
# def get_lda_score(negativeness_score):
# return MinMaxScaler().fit_transform(negativeness_score.reshape(-1,1))
# df['lda_score'] = df.apply(lambda x: get_lda_score(x['Phrase']), axis=1)
# df['lda_score'] = df.apply(lambda x: get_lda_score(negativeness_score), axis=1)
# df['lda_predict'] = (df.lda_score > .5).astype(int)
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))
df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)
df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1,1))
df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)
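# Sanity check (added): with the threshold split used above, a prediction of True
# should line up with Sentiment >= num, so agreement with the raw labels can be read
# off directly. This mirrors the 'check' comparison done at the end of the notebook.
print((df['lda_predict_01'] == (df['Sentiment'] >= 2)).mean())
print((df['lda_predict_34'] == (df['Sentiment'] >= 3)).mean())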
def calculate_actual(row):
    # Reconstruct a coarse label from the two binary splits (01 vs 234 and 012 vs 34).
    # Rows matching none of the branches are left as NaN.
    if (row['lda_predict_01'] == False) and (row['lda_predict_34'] == False):
        return 0
    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == False):
        return 1
    elif (row['lda_predict_01'] == True) and (row['lda_predict_34'] == True):
        return 3
df['actual'] = df.apply(calculate_actual, axis=1)
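# Added peek at the reconstruction: rows falling into none of the branches above come
# back as NaN, so dropna=False keeps them visible, and the mean gives the agreement rate.
print(df['actual'].value_counts(dropna=False))
print((df['actual'] == df['Sentiment']).mean())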
def get_dividing_vec_og(df, num):
    # Threshold split kept under its own name, since get_dividing_vec is redefined below.
    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
def get_dividing_vec(df, num):
    # One-vs-rest split: PoN == 0 for the target class `num`, 1 for every other class.
    df['PoN'] = [0 if x == num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    return negativity_score
negativeness_score_0 = get_dividing_vec(df, 0) # splits into 0, 1234
negativeness_score_1 = get_dividing_vec(df, 1) # splits into 1, 0234
negativeness_score_2 = get_dividing_vec(df, 2) # splits into 2, 0134
negativeness_score_3 = get_dividing_vec(df, 3) # splits into 3, 0124
negativeness_score_4 = get_dividing_vec(df, 4) # splits into 4, 0123
df['lda_score_0'] = MinMaxScaler().fit_transform(negativeness_score_0.reshape(-1,1))
df['lda_predict_0'] = df.apply(lambda x: (x['lda_score_0'] >.5), axis=1)
df['lda_score_1'] = MinMaxScaler().fit_transform(negativeness_score_1.reshape(-1,1))
df['lda_predict_1'] = df.apply(lambda x: (x['lda_score_1'] >.5), axis=1)
df['lda_score_2'] = MinMaxScaler().fit_transform(negativeness_score_2.reshape(-1,1))
df['lda_predict_2'] = df.apply(lambda x: (x['lda_score_2'] >.5), axis=1)
df['lda_score_3'] = MinMaxScaler().fit_transform(negativeness_score_3.reshape(-1,1))
df['lda_predict_3'] = df.apply(lambda x: (x['lda_score_3'] >.5), axis=1)
df['lda_score_4'] = MinMaxScaler().fit_transform(negativeness_score_4.reshape(-1,1))
df['lda_predict_4'] = df.apply(lambda x: (x['lda_score_4'] >.5), axis=1)
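# Sketch (added, not the author's method): the five one-vs-rest scores can be combined
# into a single multiclass guess. Because PoN == 0 marks the target class in this
# version of get_dividing_vec, a *lower* lda_score_k means "closer to class k", so the
# combined guess is the argmin across the five score columns. Each column is min-max
# scaled separately, so this is only a rough heuristic.
score_cols = ['lda_score_0', 'lda_score_1', 'lda_score_2', 'lda_score_3', 'lda_score_4']
df['ovr_guess'] = df[score_cols].values.argmin(axis=1)
print((df['ovr_guess'] == df['Sentiment']).mean())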
negativeness_score_01 = get_dividing_vec_og(df, 2) # splits into 01, 234
negativeness_score_34 = get_dividing_vec_og(df, 3) # splits into 012, 34
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1,1))
df['lda_predict_01'] = df.apply(lambda x: (x['lda_score_01'] >.5), axis=1)
df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1,1))
df['lda_predict_34'] = df.apply(lambda x: (x['lda_score_34'] >.5), axis=1)
from collections import Counter
neg_df = df[df['lda_predict_01'] == False].copy()  # copy so the new columns below don't trigger SettingWithCopyWarning
# print(len(neg_df))
# Counter(neg_df['Sentiment'])
neg_df
negativeness_score_01_0 = get_dividing_vec_og(neg_df.copy(), 1) # splits the predicted-negative rows into 0, 1234
neg_df['lda_score_01_0'] = MinMaxScaler().fit_transform(negativeness_score_01_0.reshape(-1,1))
neg_df['lda_predict_01_0'] = neg_df.apply(lambda x: (x['lda_score_01_0'] >.5), axis=1)
# neg_df
columns = ['Sentiment','lda_predict_01_0']
view_df = pd.DataFrame(neg_df, columns = columns)
view_df
Counter(neg_df['lda_predict_01_0'])
Counter(neg_df['Sentiment'])
columns = ['Sentiment','lda_predict_01','lda_predict_34','lda_predict_0', 'lda_predict_1','lda_predict_2','lda_predict_3','lda_predict_4']
view_df = pd.DataFrame(df, columns = columns)
view_df[view_df['Sentiment'] == 4]
view_df[view_df['Sentiment'] == 3]
df
columns = ['Sentiment','lda_score_0', 'lda_score_1','lda_score_2','lda_score_3','lda_score_4']
view_df = pd.DataFrame(df, columns = columns)
# Average (min-max scaled) one-vs-rest score for each class.
print(view_df['lda_score_0'].sum()/len(view_df))
print(view_df['lda_score_1'].sum()/len(view_df))
print(view_df['lda_score_2'].sum()/len(view_df))
print(view_df['lda_score_3'].sum()/len(view_df))
print(view_df['lda_score_4'].sum()/len(view_df))
view_df[view_df['Sentiment'] == 0 ]
# def split_into_neg_pos(df):
# # USE CENTEROID
def get_lda(df):
    # Fit TF-IDF on the phrases, score each document against the centroid difference
    # of the two PoN classes, then threshold the min-max scaled score at 0.5.
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    # Accuracy against PoN, plus everything needed to reuse the centroids later.
    return (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3), df, negativity_score, n_centroid, p_centroid
import pandas as pd
train=pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y=train['Sentiment'].values
X=train['Phrase'].values
train.to_csv('kaggle_csv.csv')
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)
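# Added peek at the direction vector: its components line up with the vectorizer's
# vocabulary, so the extreme entries are the most class-loaded terms. Note that
# get_feature_names_out is the newer sklearn name; older releases use get_feature_names.
import numpy as np
terms = np.array(tfidf_model.get_feature_names_out())
direction = n_cent - p_cent
print(terms[np.argsort(direction)[-10:]])  # terms pulling scores toward the PoN == 1 side
print(terms[np.argsort(direction)[:10]])   # terms pulling scores toward the PoN == 0 side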
def get_lda_submission(df, n_centroid, p_centroid):
    # Reuse the already-fitted vectorizer (transform, not fit_transform) so the
    # columns line up with the centroids learned on the training phrases.
    tfidf_docs = tfidf_model.transform(df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df
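# Sketch of a Kaggle-style submission (added; the test.tsv path and PhraseId column
# are assumed to match the competition files and are not from the original notebook):
# test = pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')
# test_df = pd.DataFrame(test, columns=['Phrase'])
# test_pred = get_lda_submission(test_df, n_cent, p_cent)
# test_pred['PhraseId'] = test['PhraseId']
# test_pred[['PhraseId', 'lda_predict']].to_csv('submission.csv', index=False)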
columns = ['Phrase']
new_df = pd.DataFrame(df, columns = columns)
# new_df
predicted_df = get_lda_submission(new_df, n_cent, p_cent)
predicted_df['actual'] = df['S0']
Counter(predicted_df['lda_predict'] )
predicted_df[(predicted_df.actual == 0) & (predicted_df.lda_predict == 0)].count()
predicted_df[(predicted_df.actual == 1) & (predicted_df.lda_predict == 0)].count()
predicted_df[(predicted_df.actual > 1) & (predicted_df.lda_predict == 0)].count()
predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]
predicted_df['check'] = predicted_df.apply(lambda x: (x['lda_predict'] == x['bool']), axis=1)
predicted_df
Counter(predicted_df['check'])
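# Same number computed directly (added): 'check' is boolean, so its mean is the
# fraction of rows where the thresholded prediction matches the binarised label.
print(predicted_df['check'].mean())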
len(df)
121740/len(df)