Using this tutorial! | 10-22-19
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized
# Load the training split of the 20 newsgroups corpus (all categories).
newsgroups_train = fetch_20newsgroups(subset="train")

# Pretty-print the category labels that ship with the dataset.
from pprint import pprint

pprint(
    list(newsgroups_train.target_names)
)
The data is in the `filenames` and `target` attributes (`target` is the integer index of the category).
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# Restrict the experiment to four newsgroups.
# Earlier variants, left disabled by the author:
#   categories = ['alt.atheism', 'sci.space']
#   ng_atheism = fetch_20newsgroups(subset='train', categories=['alt.atheism'])
#   vectors = TfidfVectorizer().fit_transform(ng_atheism.data)
categories = [
    "alt.atheism",
    "talk.religion.misc",
    "comp.graphics",
    "sci.space",
]
newsgroups_train = fetch_20newsgroups(subset="train", categories=categories)

# Fit the TF-IDF vectorizer on the training posts and vectorize them.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)
vectors.shape  # bare expression: value is discarded when run as a script

# Vectorize the test split with the already-fitted vectorizer (no refit).
newsgroups_test = fetch_20newsgroups(subset="test", categories=categories)
vectors_test = vectorizer.transform(newsgroups_test.data)

# Train multinomial naive Bayes on the training vectors, predict on the test
# vectors, and compute the macro-averaged F1 score.
clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)
# bare expression: the score is neither stored nor printed
metrics.f1_score(newsgroups_test.target, pred, average="macro")
import numpy as np
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted feature names for each category.

    For every category ``i`` the features are ranked by the ``i``-th row of
    the classifier's weight matrix and the last ten of that ascending ranking
    are printed on one line as ``"<category>: <name> <name> ..."``.

    Args:
        classifier: Fitted model exposing one row of per-feature weights per
            category, as ``coef_`` or, when that attribute is absent, as
            ``feature_log_prob_``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        categories: Iterable of category labels; label ``i`` is paired with
            weight row ``i``.
    """
    # Newer scikit-learn releases replaced get_feature_names() with
    # get_feature_names_out(); accept whichever the vectorizer provides.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    feature_names = np.asarray(get_names())

    # Naive Bayes estimators in newer scikit-learn no longer expose coef_;
    # their per-class feature weights live in feature_log_prob_.
    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))
import numpy as np
def show_top10(classifier, vectorizer, categories):
    """Print, for each category, all feature indices sorted by ascending weight.

    Debugging variant: it prints the raw ``np.argsort`` result of each weight
    row instead of translating the indices into feature names.

    Args:
        classifier: Fitted model exposing one row of per-feature weights per
            category, as ``coef_`` or, when that attribute is absent, as
            ``feature_log_prob_``.
        vectorizer: Unused here; kept so the call signature stays the same.
        categories: Iterable of category labels; only their positions are
            used, to select the matching weight row.
    """
    # Naive Bayes estimators in newer scikit-learn no longer expose coef_;
    # their per-class feature weights live in feature_log_prob_.
    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    for i, _category in enumerate(categories):
        print(np.argsort(weights[i]))
# Run the feature report for the fitted classifier, using the training set's
# category names as labels.
show_top10(
    clf,
    vectorizer,
    newsgroups_train.target_names,
)