import numpy as np
import pandas as pd
# Load the tab-separated training file into a DataFrame and peek at the first rows.
tr = pd.read_csv("kaggle-sentiment/train.tsv", sep="\t")
tr.head()
# Pull the text column (features) and the label column out as plain arrays.
X_data = tr["Phrase"].values
y_labels = tr["Sentiment"].values
# Show the first five labels, then the first five phrases.
for preview in (y_labels, X_data):
    print(preview[:5])
# HOLD-OUT TEST
from sklearn.model_selection import train_test_split

# Hold out 40% of the phrases as the test set; the fixed random_state makes
# the shuffle (and therefore the split) reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_labels, test_size=0.4, random_state=0
)
def get_array_for_barchart(dataset):
    """Return the distinct values in *dataset* and how often each occurs.

    Args:
        dataset: Array-like of values (anything ``np.unique`` accepts).

    Returns:
        A ``(unique, counts)`` tuple of NumPy arrays: the sorted distinct
        values and, aligned by position, the number of occurrences of each.
    """
    unique, counts = np.unique(dataset, return_counts=True)
    return unique, counts
# Per-class (labels, counts) pairs for the training and test labels.
bc_y_train, bc_y_test = (
    get_array_for_barchart(labels) for labels in (y_train, y_test)
)
bc_y_train
bc_y_test
import seaborn as sns
import matplotlib.pyplot as plt
# Class labels for the x axis, and the per-class counts of each split.
x = bc_y_train[0].tolist()
train = bc_y_train[1].tolist()
test = bc_y_test[1].tolist()
# Number of bars drawn per sentiment class (one per split).
n_bars = 2
# Long-format frame with one row per (sentiment, split) pair: the labels are
# repeated once per split and the two count lists are concatenated to match.
# NOTE(review): this reuses the training labels for the test counts, so it
# assumes both splits contain the same set of classes -- confirm.
df = pd.DataFrame(
    list(zip(x * n_bars, ["train"] * len(x) + ["test"] * len(x), train + test)),
    columns=["sentiment", "dataset", "data"],
)
# Grouped bar chart: one group per sentiment class, one bar per split.
plt.figure(figsize=(10, 6))
sns.barplot(x="sentiment", hue="dataset", y="data", data=df)
plt.show()
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
# Both vectorizers decode text as latin-1, drop English stop words and ignore
# terms that appear in fewer than 5 documents; they differ only in `binary`
# (presence/absence features vs. raw term counts).
shared_vect_args = {"encoding": "latin-1", "min_df": 5, "stop_words": "english"}
bool_vect = CountVectorizer(binary=True, **shared_vect_args)
count_vect = CountVectorizer(binary=False, **shared_vect_args)
# FIT: Collect the unique tokens into the vocabulary (fit()).
# TRANSFORM: Convert each document to a vector based on the vocabulary (transform()).
# OR do both steps at once with fit_transform().
# NOTE: Only use the vocabulary constructed from the training data to vectorize the test data
# (we must call transform() only, not fit_transform(), which would generate a new vocabulary from the test data).
# Learn the vocabulary from the training phrases and vectorize them in one step.
X_tr_vect = count_vect.fit_transform(X_train)
# vocabulary_ maps each term to its column index in the feature matrix.
vocab = count_vect.vocabulary_
# First five terms, then the first five (term, index) pairs.
list(vocab)[:5]
list(vocab.items())[:5]
# The five entries with the highest column indices.
sorted(vocab.items(), key=lambda pair: pair[1])[-5:]
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with default settings, trained on the count
# features of the training split.
nb_clf = MultinomialNB()
nb_clf.fit(X=X_tr_vect, y=y_train)