# Stdlib imports.
from collections import Counter

# Third-party imports for TF-IDF centroid-based sentiment classification.
import pandas as pd
from nltk.tokenize.casual import casual_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Shared vectorizer used by both the training and submission paths;
# NLTK's casual tokenizer handles informal/social-media-style text.
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
# Load the Kaggle sentiment training split (tab-separated file).
train = pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')

# Raw label and text arrays, kept around for downstream model code.
y = train['Sentiment'].values
X = train['Phrase'].values

# Persist a comma-separated copy of the training data.
train.to_csv('kaggle_csv.csv')

# Working frame: the phrase text plus its original 0-4 sentiment score
# (renamed 'S0' here).
df = pd.DataFrame({'Phrase': train['Phrase'], 'S0': train['Sentiment']})
def get_lda(df):
    """Score and classify phrases by projection onto the class-centroid axis.

    Fits the module-level ``tfidf_model`` on ``df['Phrase']``, computes the
    TF-IDF centroid of the PoN == 1 documents and of the PoN == 0 documents,
    and projects every document onto the (PoN==1 minus PoN==0) centroid
    direction.  The projection is min-max scaled to [0, 1] and thresholded
    at 0.5 to produce a binary prediction.

    Side effects: refits the global ``tfidf_model`` in place, and adds the
    ``lda_score`` and ``lda_predict`` columns to ``df`` in place.

    Args:
        df: DataFrame with 'Phrase' (text) and 'PoN' (0/1 label) columns.

    Returns:
        tuple: (accuracy rounded to 3 decimals, the mutated df,
                raw projection scores, centroid of PoN==1 docs,
                centroid of PoN==0 docs)
    """
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    # True where PoN == 1.  NOTE: the original code called the PoN==1
    # centroid "n_centroid" and the PoN==0 one "p_centroid" - the names
    # were swapped relative to their contents; renamed here for clarity.
    mask = df.PoN.astype(bool).values
    pos_centroid = tfidf_docs[mask].mean(axis=0)
    neg_centroid = tfidf_docs[~mask].mean(axis=0)
    # Signed projection onto the axis separating the two centroids; larger
    # values mean "closer to the PoN==1 centroid".
    axis_score = tfidf_docs.dot(pos_centroid - neg_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(axis_score.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    # Accuracy: fraction of rows where the binary prediction matches PoN.
    score = (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3)
    return score, df, axis_score, pos_centroid, neg_centroid
def get_lda_submission(df, n_centroid, p_centroid):
    """Score new phrases against centroids learned by ``get_lda``.

    Args:
        df: DataFrame with a 'Phrase' column to score.
        n_centroid: centroid of the PoN == 1 training documents, as
            returned by ``get_lda``.
        p_centroid: centroid of the PoN == 0 training documents.

    Side effects: adds 'lda_score' (min-max scaled projection) and
    'lda_predict' (binary, 0.5 threshold) columns to ``df`` in place.

    Returns:
        The mutated ``df``.
    """
    # BUG FIX: use transform(), not fit_transform().  Refitting the
    # vectorizer on the new documents rebuilds the vocabulary, so the
    # feature axes would no longer line up with the training centroids
    # and the dot product below would fail or be meaningless.
    tfidf_docs = tfidf_model.transform(raw_documents=df['Phrase']).toarray()
    score_vec = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(score_vec.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df
# Binarize the 0-4 sentiment: 0/1 -> negative (0), 2/3/4 -> positive (1).
# NOTE(review): this maps neutral (2) to the positive class - confirm intended.
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
# Originally a bare expression whose value was discarded; print the count
# of positive-labelled rows so the script actually reports it.
print(len(df[df['PoN'] == 1]))

# Fit on the full training frame; keep the centroids for re-scoring.
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)

# Re-score just the phrase column through the submission path as a check.
new_df = pd.DataFrame(df, columns=['Phrase'])
predicted_df = get_lda_submission(new_df, n_cent, p_cent)

# Compare the re-scored predictions against the binarized ground truth.
predicted_df['actual'] = df['S0']
predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]
# Vectorized element-wise comparison instead of a per-row .apply() lambda.
predicted_df['check'] = predicted_df['lda_predict'] == predicted_df['bool']
# Print the correct/incorrect tally (originally a discarded expression).
print(Counter(predicted_df['check']))
# Load the held-out test split BEFORE using it.  In the original code
# `test` was referenced several lines before this read, raising a NameError.
test = pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')

# Score the test phrases with the centroids learned from the training set.
predicted_df_test = get_lda_submission(test, n_cent, p_cent)

# Sanity checks: centroid dimensionality and test-set size/contents
# (the bare `test` expression was a discarded notebook artifact - print it).
print(len(n_cent))
print(len(p_cent))
print(len(test))
print(test)
# One-hot encode the phrase text in both splits, then align the test
# frame's columns to the training frame's columns so both feed the same
# model; columns absent from the test encoding are filled with zeros.
training_encoded = pd.get_dummies(train, columns=['Phrase'])
test_encoded = pd.get_dummies(test, columns=['Phrase'])
test_encoded_for_model = test_encoded.reindex(
    columns=training_encoded.columns, fill_value=0
)