Using this youtube tutorial

  1. Import data
  2. Change label columns to 1 and 0
  3. Split the columns
  4. Split into test and train
  5. TURN ALL DATA TO NUMBERS! Convert words/text data into numbers without losing much of the information

    a. Bag of Words is one way (gives number to each word -- inefficient)

    b. CountVectorizer is better!

     i. Counts number of words in document 
     ii. Number of features is the total number of (unique) words in corpus
In [2]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.cross_validation import train_test_split
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
In [4]:
df=pd.read_csv('spamham', sep='\t', names=['Status', 'Message'])
In [42]:
from tabulate import tabulate

def shorten(long_string):
    return long_string[:1] if len(long_string) < 21 else long_string[:20]

def df_for_tabulate(df, column):
    pretty_df = df.copy()
    pretty_df[column] = pretty_df.apply(lambda x: shorten(x[column]), axis = 1)
    return pretty_df
    
tabulate_df = df_for_tabulate(df, 'Message')
print(tabulate(tabulate_df[:5], tablefmt="rst", headers=tabulate_df.columns))
====  ========  ====================
  ..  Status    Message
====  ========  ====================
   0  ham       Go until jurong poin
   1  ham       Ok lar... Joking wif
   2  spam      Free entry in 2 a wk
   3  ham       U dun say so early h
   4  ham       Nah I don't think he
====  ========  ====================
In [6]:
len(df)
Out[6]:
5572
In [47]:
# w.female[w.female == 'female'] = 1 
# w.female[w.female == 'male']   = 0

df['Status'][df['Status'] == 'ham'] = 1
df['Status'][df['Status'] == 'spam'] = 0
In [49]:
df.head()
Out[49]:
Status Message
0 1 Go until jurong point, crazy.. Available only ...
1 1 Ok lar... Joking wif u oni...
2 0 Free entry in 2 a wkly comp to win FA Cup fina...
3 1 U dun say so early hor... U c already then say...
4 1 Nah I don't think he goes to usf, he lives aro...
In [51]:
# inverse_transform
# fit_transform

cv = CountVectorizer()
In [59]:
cv1 = cv.fit_transform(df['Message'])
cv1b = cv.inverse_transform(df['Message'])
In [60]:
cv1.toarray()
Out[60]:
array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])
In [61]:
cv1b
Out[61]:
[array(['00', '000', '000pes', ..., 'online', 'onluy', 'only'],
       dtype='<U34')]
In [ ]: