Using this YouTube tutorial

1. Import data
2. Change the label column to 1 and 0
3. Split the columns
4. Split into test and train
5. TURN ALL DATA TO NUMBERS! Convert words/text data into numbers without losing much of the information

a. Bag of Words is one way (gives number to each word -- inefficient)

b. CountVectorizer is better!
```
 i. Counts how many times each word occurs in each document
 ii. The number of features is the total number of unique words in the corpus
```

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# Load the tab-separated 'spamham' file into a two-column frame; the column
# names are supplied explicitly through ``names``.
df = pd.read_csv(
    'spamham',
    sep='\t',
    names=['Status', 'Message'],
)

from tabulate import tabulate

def shorten(long_string, max_length=20):
    """Return ``long_string`` cut down to at most ``max_length`` characters.

    Strings that already fit within the limit are returned unchanged; longer
    strings are truncated to their first ``max_length`` characters.

    Args:
        long_string: the text to shorten.
        max_length: maximum number of characters to keep (default 20).

    Returns:
        The (possibly truncated) string.
    """
    # Slicing never reads past the end, so one slice covers both the
    # "already short" and the "needs truncating" cases.
    return long_string[:max_length]

def df_for_tabulate(df, column):
    """Return a copy of ``df`` whose ``column`` is shortened for table display.

    The input frame is not modified. Every value in ``column`` is passed
    through ``shorten`` so that long text does not widen the printed table.

    Args:
        df: the pandas DataFrame to prepare.
        column: label of the column to shorten (assumed to hold strings).

    Returns:
        A new DataFrame equal to ``df`` except for the shortened ``column``.
    """
    pretty_df = df.copy()
    # Transform just the one column with Series.map rather than a row-wise
    # ``apply(axis=1)``: it avoids building a Series per row and still
    # yields a Series when the frame is empty.
    pretty_df[column] = pretty_df[column].map(shorten)
    return pretty_df
    
# Shorten the Message column, then print the first five rows as an
# RST-formatted table with the frame's column labels as headers.
tabulate_df = df_for_tabulate(df, 'Message')
first_rows = tabulate_df.head(5)
print(tabulate(first_rows, headers=tabulate_df.columns, tablefmt="rst"))

====  ========  ====================
  ..  Status    Message
====  ========  ====================
   0  ham       Go until jurong poin
   1  ham       Ok lar... Joking wif
   2  spam      Free entry in 2 a wk
   3  ham       U dun say so early h
   4  ham       Nah I don't think he
====  ========  ====================

# Number of rows loaded into the frame.
df.shape[0]

5572

# w.female[w.female == 'female'] = 1 
# w.female[w.female == 'male']   = 0

# Encode the labels numerically: ham -> 1, spam -> 0.
# Each label is rewritten with a single .loc assignment. The chained form
# ``df['Status'][mask] = value`` assigns through an intermediate object,
# which raises SettingWithCopyWarning and leaves ``df`` unchanged when
# pandas Copy-on-Write is enabled.
df.loc[df['Status'] == 'ham', 'Status'] = 1
df.loc[df['Status'] == 'spam', 'Status'] = 0

df.head()

# inverse_transform
# fit_transform

# Learn the vocabulary from the messages and build the document-term count
# matrix (one row per message, one column per vocabulary term).
cv = CountVectorizer()

cv1 = cv.fit_transform(df['Message'])
# inverse_transform maps a document-term matrix back to the terms present in
# each document, so it must be given the matrix produced above rather than
# the raw text column.
cv1b = cv.inverse_transform(cv1)

cv1.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# Display the value returned by cv.inverse_transform.
cv1b

[array(['00', '000', '000pes', ..., 'online', 'onluy', 'only'],
       dtype='<U34')]

	Status	Message
0	1	Go until jurong point, crazy.. Available only ...
1	1	Ok lar... Joking wif u oni...
2	0	Free entry in 2 a wkly comp to win FA Cup fina...
3	1	U dun say so early hor... U c already then say...
4	1	Nah I don't think he goes to usf, he lives aro...