HW8 (HW7 V4)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from nltk.tokenize.casual import casual_tokenize

# TF-IDF vectorizer using NLTK's casual tokenizer (handles informal, social-media-style text)
tfidf_model = TfidfVectorizer(tokenizer=casual_tokenize)
In [9]:
from collections import Counter
import pandas as pd
# Load the Kaggle sentiment training data
train = pd.read_csv("../WK7/kaggle-sentiment/train.tsv", delimiter='\t')
y = train['Sentiment'].values
X = train['Phrase'].values
train.to_csv('kaggle_csv.csv')

# Working frame with the phrase text and its 0-4 sentiment label
df = pd.DataFrame()
df['Phrase'] = train['Phrase']
df['S0'] = train['Sentiment']

def get_lda(df):
    # Fit the TF-IDF model on the training phrases (the vocabulary is learned here)
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    # mask is True where PoN == 1 (positive), so despite the variable names the
    # first centroid is the positive-class centroid and the second the negative one
    mask = df.PoN.astype(bool).values
    n_centroid = tfidf_docs[mask].mean(axis=0)
    p_centroid = tfidf_docs[~mask].mean(axis=0)
    # Project each document onto the line between the two centroids;
    # higher values lean toward the PoN == 1 (positive) class
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    # Training accuracy of the thresholded score against the PoN labels
    score = (1. - (df.PoN - df.lda_predict).abs().sum() / len(df)).round(3)
    return score, df, negativity_score, n_centroid, p_centroid

def get_lda_submission(df, n_centroid, p_centroid):
    # NOTE: fit_transform re-fits the vectorizer on these documents, so the
    # vocabulary (and therefore the vector length) can differ from the one the
    # centroids were computed with -- this is what triggers the shape error below
    tfidf_docs = tfidf_model.fit_transform(raw_documents=df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df

# Binary label: 0 for negative sentiment (0-1), 1 for neutral/positive (2-4)
df['PoN'] = [0 if x < 2 else 1 for x in df['S0']]
len(df[df['PoN'] == 1])
lda_score, df, neg_score, n_cent, p_cent = get_lda(df)

# Re-score the training phrases through the submission path as a sanity check
columns = ['Phrase']
new_df = pd.DataFrame(df, columns=columns)
predicted_df = get_lda_submission(new_df, n_cent, p_cent)

# Compare the thresholded predictions against the binarized true labels
predicted_df['actual'] = df['S0']
predicted_df['bool'] = [0 if x < 2 else 1 for x in predicted_df['actual']]
predicted_df['check'] = predicted_df.apply(lambda x: x['lda_predict'] == x['bool'], axis=1)

Counter(predicted_df['check'])
Out[9]:
Counter({False: 34320, True: 121740})
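The counter implies a training accuracy of roughly 78%, which should match the score returned by get_lda since the same phrases and threshold are used. A quick check (not in the original notebook):

121740 / (121740 + 34320)   # ≈ 0.780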
In [14]:
predicted_df_test = get_lda_submission(test, n_cent, p_cent)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-14-37062cd7aca6> in <module>
----> 1 predicted_df_test = get_lda_submission(test, n_cent, p_cent)

<ipython-input-9-872ca310f106> in get_lda_submission(df, n_centroid, p_centroid)
     25 def get_lda_submission(df, n_centroid, p_centroid):
     26     tfidf_docs = tfidf_model.fit_transform(raw_documents = df['Phrase']).toarray()
---> 27     negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
     28     df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1,1))
     29     df['lda_predict'] = (df.lda_score > .5).astype(int)

ValueError: shapes (66292,9970) and (16394,) not aligned: 9970 (dim 1) != 16394 (dim 0)
In [16]:
print(len(n_cent))
print(len(p_cent))
print(len(test))
16394
16394
66292
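The centroids live in a space of 16,394 TF-IDF features (the training vocabulary), but get_lda_submission re-fits the vectorizer on the 66,292 test phrases and ends up with only 9,970 features, so the dot product cannot align. A minimal sketch of a fix, assuming the vectorizer is first re-fit on the training phrases (the failed call above already re-fit tfidf_model on the test set) and using a hypothetical function name not from the notebook:

def get_lda_submission_fixed(df, n_centroid, p_centroid):
    # transform() reuses the vocabulary learned when tfidf_model was fit on the
    # training phrases, so the vectors line up with the 16,394-dim centroids
    tfidf_docs = tfidf_model.transform(df['Phrase']).toarray()
    negativity_score = tfidf_docs.dot(n_centroid - p_centroid)
    df['lda_score'] = MinMaxScaler().fit_transform(negativity_score.reshape(-1, 1))
    df['lda_predict'] = (df.lda_score > .5).astype(int)
    return df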
In [17]:
# Load the Kaggle sentiment test data
test = pd.read_csv("../WK7/kaggle-sentiment/test.tsv", delimiter='\t')
test
Out[17]:
PhraseId SentenceId Phrase
0 156061 8545 An intermittently pleasing but mostly routine ...
1 156062 8545 An intermittently pleasing but mostly routine ...
2 156063 8545 An
3 156064 8545 intermittently pleasing but mostly routine effort
4 156065 8545 intermittently pleasing but mostly routine
... ... ... ...
66287 222348 11855 A long-winded , predictable scenario .
66288 222349 11855 A long-winded , predictable scenario
66289 222350 11855 A long-winded ,
66290 222351 11855 A long-winded
66291 222352 11855 predictable scenario

66292 rows × 3 columns

In [ ]:
# One-hot encode the phrase text, then align the test columns to the training columns
training_encoded = pd.get_dummies(train, columns=['Phrase'])
test_encoded = pd.get_dummies(test, columns=['Phrase'])
test_encoded_for_model = test_encoded.reindex(columns=training_encoded.columns,
    fill_value=0)
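The reindex call gives the test frame exactly the training schema: dummy columns the test set lacks are added and filled with 0, and columns the training set never saw are dropped. A toy illustration with made-up frames (not from the notebook):

a = pd.get_dummies(pd.DataFrame({'Phrase': ['good', 'bad']}), columns=['Phrase'])
b = pd.get_dummies(pd.DataFrame({'Phrase': ['bad', 'ugly']}), columns=['Phrase'])
# keeps Phrase_bad, adds Phrase_good as all zeros, drops Phrase_ugly
b.reindex(columns=a.columns, fill_value=0)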