import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
og_df = pd.read_csv('kaggle_csv.csv')  # expects 'Phrase' and 'Sentiment' columns (Kaggle movie-review data)
df_sm = og_df.copy()
df_sm = df_sm.sample(frac=0.10, replace=True, random_state=1)  # 10% sample (with replacement) to keep things fast
df = df_sm.copy()
len(df_sm)
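# Optional sanity check: the 10% sample should still cover every Sentiment class
# (assuming the usual 0-4 Kaggle labels) before anything is fit on it.
df['Sentiment'].value_counts().sort_index()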
def get_dividing_vec(df, num):
    """Score each phrase along the line joining the two class centroids in TF-IDF space.

    Phrases with Sentiment >= num get PoN = 1, the rest PoN = 0 (the column is
    overwritten on every call).  Higher scores mean a phrase sits closer to the
    PoN == 1 (higher-sentiment) centroid.
    """
    df['PoN'] = [0 if x < num else 1 for x in df['Sentiment']]
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    mask = df['PoN'].astype(bool).values          # True for the higher-sentiment group
    p_centroid = tfidf_docs[mask].mean(axis=0)    # centroid of the PoN == 1 phrases
    n_centroid = tfidf_docs[~mask].mean(axis=0)   # centroid of the PoN == 0 phrases
    # Project every document onto the vector between the centroids (LDA-style score).
    return tfidf_docs.dot(p_centroid - n_centroid)
# Despite the variable names, a larger score means a phrase sits closer to the
# upper group's centroid (looks more positive); the > 0.5 threshold below relies on that.
negativeness_score_01 = get_dividing_vec(df, 2)  # splits into {0,1} vs {2,3,4}
negativeness_score_34 = get_dividing_vec(df, 3)  # splits into {0,1,2} vs {3,4}
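# The raw projection scores are arbitrary real numbers; a quick look at their spread
# shows why they get squashed into [0, 1] with MinMaxScaler before thresholding at 0.5.
pd.Series(negativeness_score_01).describe()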
# Earlier, row-wise attempt (superseded by the vectorized version below):
# def get_lda_score(negativeness_score):
#     return MinMaxScaler().fit_transform(negativeness_score.reshape(-1, 1))
# df['lda_score'] = df.apply(lambda x: get_lda_score(x['Phrase']), axis=1)
# df['lda_score'] = df.apply(lambda x: get_lda_score(negativeness_score), axis=1)
# df['lda_predict'] = (df.lda_score > .5).astype(int)
df['lda_score_01'] = MinMaxScaler().fit_transform(negativeness_score_01.reshape(-1, 1)).ravel()
df['lda_predict_01'] = df['lda_score_01'] > 0.5   # True -> looks like the {2,3,4} side
df['lda_score_34'] = MinMaxScaler().fit_transform(negativeness_score_34.reshape(-1, 1)).ravel()
df['lda_predict_34'] = df['lda_score_34'] > 0.5   # True -> looks like the {3,4} side
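# Rough training-set check (not a held-out evaluation): each thresholded score should
# mostly agree with the split it was built from, i.e. predict_01 with Sentiment >= 2
# and predict_34 with Sentiment >= 3.
acc_01 = (df['lda_predict_01'] == (df['Sentiment'] >= 2)).mean()
acc_34 = (df['lda_predict_34'] == (df['Sentiment'] >= 3)).mean()
acc_01, acc_34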
def calculate_actual(row):
    # Combine the two binary predictions into one coarse three-way label.
    if not row['lda_predict_01'] and not row['lda_predict_34']:
        return 0   # neither split looks positive -> most negative bucket
    elif row['lda_predict_01'] and not row['lda_predict_34']:
        return 1   # above the 0/1 split but below the 3/4 split -> middle bucket
    elif row['lda_predict_01'] and row['lda_predict_34']:
        return 3   # both splits look positive -> most positive bucket
    # contradictory predictions (False, True) fall through and return None
df['actual'] = df.apply(calculate_actual, axis=1)
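# Rough agreement check, assuming the 0/1/3 codes above stand for the Sentiment buckets
# {0,1}, {2} and {3,4}; rows with contradictory split predictions come back as None
# and simply count as disagreements here.
coarse_truth = df['Sentiment'].apply(lambda s: 0 if s <= 1 else (1 if s == 2 else 3))
(df['actual'] == coarse_truth).mean()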