# Attempt 1, using this as a guide
# cleaning texts
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('stopwords')
## 2a: Put the raw reviews straight into a labelled data frame
dataset = pd.DataFrame(
    [
        ("I liked the movie", "positive"),
        ("It’s a good movie. Nice story", "positive"),
        ("Hero’s acting is bad but heroine looks good. Overall nice movie", "positive"),
        ("Nice songs. But sadly boring ending.", "negative"),
        ("sad movie, boring movie", "negative"),
    ],
    columns=["Text", "Reviews"],
)
corpus = []
## 2b: Clean and process the data
## Processing includes: lowering, splitting, stemming, stop-word
## removal, and removing non-alpha characters with regex
ps = PorterStemmer()  # hoisted out of the loop: one stemmer instance suffices
stop_words = set(stopwords.words('english'))  # set for O(1) membership tests
for i in range(len(dataset)):  # was hard-coded range(0, 5)
    # keep letters only; every other character becomes a space
    text = re.sub('[^a-zA-Z]', ' ', dataset['Text'][i])
    print(text)
    words = text.lower().split()
    # Stem each word and drop English stop words.  The original code
    # created the stemmer but never applied it, and never used the
    # stop-word list it downloaded — that was the bug being fixed here.
    words = [ps.stem(word) for word in words if word not in stop_words]
    corpus.append(' '.join(words))
## 2c: Check our handiwork (bare expression was a no-op in a script)
print(corpus)
# Using CountVectorizer
# Bag-of-words features; max_features caps the vocabulary size
cv = CountVectorizer(max_features=1500)
x = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values  # the "Reviews" (label) column

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; fixed seed for reproducibility
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

# NOTE(review): GaussianNB on integer count vectors works, but
# MultinomialNB is the usual choice for bag-of-words counts — kept
# as-is to preserve the original model's behavior.
classifier = GaussianNB()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
# The bare `cm` expression displayed nothing when run as a script
print(cm)