20 Newsgroups

Following the scikit-learn "20 newsgroups text dataset" tutorial | 10-22-19

In [28]:
from pprint import pprint

from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import fetch_20newsgroups_vectorized

# Load the training split with no category filter.
newsgroups_train = fetch_20newsgroups(subset='train')

# Pretty-print the category names as a plain list.
pprint([name for name in newsgroups_train.target_names])
['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

The data lives in the `filenames` and `target` attributes of the returned object; `target` holds the integer index of each document's category.

In [3]:
# categories = ['alt.atheism', 'sci.space']
# atheism = ['alt.atheism']
# ng_atheism = fetch_20newsgroups(subset='train', categories=atheism)
In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Narrow the corpus to four newsgroups; the same list is reused for the test split.
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]
newsgroups_train = fetch_20newsgroups(categories=categories, subset='train')

# Fit the TF-IDF vocabulary on the training documents and vectorize them in one step.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(newsgroups_train.data)

# (n_documents, n_features) of the resulting matrix -- shown as the cell output.
vectors.shape
Out[31]:
(2034, 34118)
In [32]:
# vectorizer = TfidfVectorizer()
# vectors = vectorizer.fit_transform(ng_atheism.data)
# vectors.shape
In [33]:
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB

# Test split restricted to the same categories as the training split.
newsgroups_test = fetch_20newsgroups(categories=categories, subset='test')

# transform() only: reuse the vectorizer fitted on the training documents.
vectors_test = vectorizer.transform(newsgroups_test.data)

clf = MultinomialNB(alpha=0.01)
clf.fit(vectors, newsgroups_train.target)
pred = clf.predict(vectors_test)

# Macro-averaged F1 over the selected categories -- shown as the cell output.
metrics.f1_score(y_true=newsgroups_test.target, y_pred=pred, average='macro')
Out[33]:
0.8821359240272957
In [34]:
import numpy as np
In [35]:
def show_top10(classifier, vectorizer, categories):
    """Print the ten highest-weighted features for each category.

    Parameters
    ----------
    classifier : fitted estimator
        Must expose a per-class weight matrix as ``coef_`` or, when that
        attribute is absent, as ``feature_log_prob_``. Row ``i`` is used for
        the ``i``-th entry of ``categories``.
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the older
        ``get_feature_names()``; the result is indexed by feature position.
    categories : iterable
        Category labels, in the same order as the rows of the weight matrix.

    Prints one line per category in the form ``"<category>: f1 f2 ... f10"``,
    with features in ascending weight order (highest-weighted feature last).
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer the replacement and fall back for older installations.
    names_getter = getattr(vectorizer, "get_feature_names_out", None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_names = np.asarray(names_getter())

    # Naive Bayes estimators no longer expose coef_ in recent releases;
    # feature_log_prob_ holds the per-class feature weights there.
    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    for i, category in enumerate(categories):
        # argsort is ascending, so the last ten indices are the largest weights.
        top10 = np.argsort(weights[i])[-10:]
        print("%s: %s" % (category, " ".join(feature_names[top10])))
In [40]:
import numpy as np
def show_top10(classifier, vectorizer, categories):
    """Diagnostic variant: print every feature index, sorted by weight, per category.

    NOTE: this redefinition shadows the earlier ``show_top10`` in this
    notebook. Instead of the ten top feature names it prints the full
    ``np.argsort`` of each category's weight row (ascending, so the
    highest-weighted feature index comes last).

    Parameters
    ----------
    classifier : fitted estimator
        Must expose a per-class weight matrix as ``coef_`` or, when that
        attribute is absent, as ``feature_log_prob_``.
    vectorizer : object
        Unused here; kept so the signature matches the earlier definition.
    categories : iterable
        One row of the weight matrix is printed per entry; the labels
        themselves are not printed.
    """
    # Naive Bayes estimators no longer expose coef_ in recent scikit-learn
    # releases; feature_log_prob_ holds the per-class feature weights there.
    weights = getattr(classifier, "coef_", None)
    if weights is None:
        weights = classifier.feature_log_prob_

    for i, _category in enumerate(categories):
        print(np.argsort(weights[i]))

# Run the inspection for the fitted classifier, one row per training category.
show_top10(
    classifier=clf,
    vectorizer=vectorizer,
    categories=newsgroups_train.target_names,
)
[17058 20259 20258 ... 22657 30993 30643]
[17058 19716 19715 ... 22657 30993 30643]
[34117 28442 28443 ... 30993 22657 30643]
[17058 17915 17913 ... 30993 22657 30643]
In [ ]: