Based on this YouTube tutorial.
TURN ALL DATA INTO NUMBERS! Convert words/text data into numbers without losing much of the information.
a. Bag of Words is one way (assigns a number to each word -- inefficient).
b. CountVectorizer is better!
i. Counts the number of times each word occurs in each document.
ii. The number of features is the total number of unique words in the corpus.
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Load the tab-separated 'spamham' file. Supplying ``names`` without ``header``
# makes pandas treat the first line as data rather than as column labels.
df = pd.read_csv(
    'spamham',
    sep='\t',
    names=['Status', 'Message'],
)
from tabulate import tabulate
def shorten(long_string, max_length=20):
    """Return ``long_string`` cut down to at most ``max_length`` characters.

    Strings that already fit are returned unchanged; longer strings are
    truncated to their first ``max_length`` characters.

    Args:
        long_string: The text to shorten.
        max_length: Maximum number of characters to keep (default 20).

    Returns:
        The leading ``max_length`` characters of ``long_string``.
    """
    # Slicing past the end of a string is safe, so a single slice covers both
    # the "already short" and the "needs truncating" cases.
    return long_string[:max_length]
def df_for_tabulate(df, column):
    """Return a copy of ``df`` whose ``column`` values are shortened for display.

    The input frame is not modified: the work is done on a copy, in which
    every value of ``column`` is passed through ``shorten``.

    Args:
        df: The DataFrame to prepare for printing.
        column: Label of the text column to shorten.

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    pretty_df = df.copy()
    # Apply element-wise on the single column. A row-wise
    # ``DataFrame.apply(axis=1)`` builds a Series per row and does not return
    # a Series when the frame has no rows.
    pretty_df[column] = pretty_df[column].apply(shorten)
    return pretty_df
# Print the first five rows, with the message text shortened, as an
# reStructuredText-style table.
tabulate_df = df_for_tabulate(df, 'Message')
print(
    tabulate(
        tabulate_df[:5],
        headers=tabulate_df.columns,
        tablefmt="rst",
    )
)
len(df)  # bare expression: the row count is only echoed in an interactive session
# Encode the labels in place: 'ham' -> 1, 'spam' -> 0.
# Each label is written with one ``.loc[row_mask, column]`` assignment rather
# than chained indexing (``df['Status'][mask] = ...``). The chained form may
# write to a temporary copy, triggers pandas' SettingWithCopyWarning, and has
# no effect when copy-on-write is enabled.
df.loc[df['Status'] == 'ham', 'Status'] = 1
df.loc[df['Status'] == 'spam', 'Status'] = 0
df.head()  # bare expression: only displayed in an interactive session
# Build the bag-of-words representation of the messages.
cv = CountVectorizer()
# fit_transform learns the vocabulary from the messages and returns the
# document-term count matrix for them.
cv1 = cv.fit_transform(df['Message'])
# inverse_transform maps a document-term matrix back to the terms present in
# each document, so it must be given the matrix (cv1), not the raw text column.
cv1b = cv.inverse_transform(cv1)
cv1.toarray()  # dense view of the counts; only displayed in an interactive session
cv1b