HW4

Attempt 1 using this as a guide

STEP 1: Import ALL the things!

In [32]:
# Libraries for text cleaning (re, nltk) and bag-of-words features (CountVectorizer)
import pandas as pd 
import re 
import nltk 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from sklearn.feature_extraction.text import CountVectorizer 
# Fetch the NLTK stop-word corpus; the log below shows it is skipped when already up to date.
# NOTE(review): `stopwords` is not referenced by any cell visible in this notebook - confirm it is still needed.
nltk.download('stopwords')
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/danielcaraway/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Out[32]:
True
In [33]:
# Toy data set: five hand-labelled movie reviews, one [review text, sentiment label] row each.
dataset = [
    ["I liked the movie", "positive"],
    ["It’s a good movie. Nice story", "positive"],
    ["Hero’s acting is bad but heroine looks good. Overall nice movie", "positive"],
    ["Nice songs. But sadly boring ending.", "negative"],
    ["sad movie, boring movie", "negative"],
]

STEP 2: Clean the data

In [38]:
## 2a: Put data into a data frame

# Wrap the raw [text, label] rows in a DataFrame, naming both columns at construction time.
dataset = pd.DataFrame(dataset, columns=["Text", "Reviews"])

# Container for the cleaned review strings produced by the cleaning step.
corpus = []
In [50]:
## 2b: Clean and process the data
## Processing includes: replacing non-alphabetic characters with spaces (regex),
## lowercasing, and collapsing whitespace by splitting and re-joining.
## NOTE: no stemming or stop-word removal is applied in this step.

# Rebuild the corpus from scratch so that re-running this cell cannot append
# duplicate rows (which would leave the feature matrix longer than the labels).
corpus = []

# Iterate over the column itself rather than a hard-coded row count,
# so the cell keeps working if reviews are added or removed.
for raw_text in dataset['Text']:
    text = re.sub('[^a-zA-Z]', ' ', raw_text)
    print(text)  # show the letters-only text before lowercasing
    # split() + join() normalizes runs of spaces left behind by the substitution
    corpus.append(' '.join(text.lower().split()))
I liked the movie
It s a good movie  Nice story
Hero s acting is bad but heroine looks good  Overall nice movie
Nice songs  But sadly boring ending 
sad movie  boring movie
In [40]:
## 2c: Check our handiwork 
## Display the cleaned corpus: lowercased, letters-only, single-spaced strings

corpus
Out[40]:
['i liked the movie',
 'it s a good movie nice story',
 'hero s acting is bad but heroine looks good overall nice movie',
 'nice songs but sadly boring ending',
 'sad movie boring movie']

STEP 3: Create Bag of Words

Using CountVectorizer

In [37]:
# Bag-of-words features: the vectorizer's vocabulary is capped at 1500 terms.
cv = CountVectorizer(max_features=1500)
term_counts = cv.fit_transform(corpus)
x = term_counts.toarray()  # dense document-term count matrix, one row per review

# Sentiment labels live in the "Reviews" column (the frame's second column).
y = dataset["Reviews"].values

STEP 4: Split the data into train and test

In [43]:
# train_test_split is imported from sklearn.model_selection
# (the older sklearn.cross_validation import path is not used).
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

STEP 5: Fit Naive Bayes to Training Set

In [45]:
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

# Train a Gaussian Naive Bayes model on the training split.
# fit() hands back the fitted estimator, so it is bound to `classifier` directly.
classifier = GaussianNB().fit(x_train, y_train)

# Trailing expression so the cell still displays the fitted estimator.
classifier
Out[45]:
GaussianNB(priors=None, var_smoothing=1e-09)
In [49]:
# Predict sentiment labels for the held-out reviews.
y_pred = classifier.predict(x_test)

# Pin the label order to the classes seen during training. Without `labels`,
# confusion_matrix infers the classes from y_test/y_pred, so on a test split
# this small the matrix could shrink or change its ordering whenever a class
# is absent. Rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_test, y_pred, labels=classifier.classes_)
cm
Out[49]:
array([[0, 0],
       [2, 0]])