This is a tutorial on using sklearn for topic modeling.
References:
(2) https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730
(3) https://nlpforhackers.io/topic-modeling/
(4) http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf
from sklearn.datasets import fetch_20newsgroups
# Load the 20 Newsgroups corpus through scikit-learn. The `remove` argument
# asks the loader to drop headers, footers and quoted replies; the fixed
# random_state keeps the shuffled order reproducible.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=('headers', 'footers', 'quotes'),
)
documents = dataset.data

# Quick sanity check: the category names and the number of documents loaded.
print(dataset.target_names)
print(len(documents))
#print(documents[0])
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
# Upper bound on the vocabulary size kept by the vectorizer.
no_features = 1000

# NMF is able to use tf-idf
#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
#tfidf = tfidf_vectorizer.fit_transform(documents)
#tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# LDA can only use raw term counts because it is a probabilistic graphical model
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')
tf = tf_vectorizer.fit_transform(documents)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer its replacement, but fall back so older installations keep working.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
from sklearn.decomposition import NMF, LatentDirichletAllocation
# Number of topics (components) the model is asked to find.
no_topics = 20

# Run NMF
#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)

# Run LDA
lda = LatentDirichletAllocation(
    n_components=no_topics,
    max_iter=5,
    learning_method='online',
    learning_offset=50.,
    random_state=0,
)
# Fit the model on the term-count matrix and keep the transformed result,
# which is inspected (shape and first row) further down the script.
lda_z = lda.fit_transform(tf)
def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic in a fitted model.

    For each row of ``model.components_`` a ``Topic <index>:`` header is
    printed, followed by one line holding the top terms separated by
    spaces, ordered from the largest weight to the smallest.

    Args:
        model: Object exposing ``components_``, an iterable of per-topic
            weight arrays; each array must provide ``argsort()``.
        feature_names: Indexable collection mapping a column index to its
            term (a list or an array of strings).
        no_top_words: How many terms to print per topic. Values larger
            than the vocabulary simply print every term.

    Raises:
        ValueError: If ``no_top_words`` is negative; a negative count
            would otherwise silently select an unrelated slice.
    """
    if no_top_words < 0:
        raise ValueError(
            "no_top_words must be non-negative, got %r" % (no_top_words,)
        )

    for topic_idx, topic in enumerate(model.components_):
        print("Topic %d:" % (topic_idx))
        # argsort() is ascending, so reverse it and keep the leading entries.
        top_indices = topic.argsort()[::-1][:no_top_words]
        print(" ".join([feature_names[i] for i in top_indices]))
# Number of terms shown for each topic.
no_top_words = 10
display_topics(
    model=lda,
    feature_names=tf_feature_names,
    no_top_words=no_top_words,
)
#print(lda.components_[0][:5])

# Inspect the transformed output: its overall shape, then the first row
# next to the raw text of the first document.
print(lda_z.shape)
print(lda_z[0])
print(documents[0])
#print(documents.shape)
#display_topics(nmf, tfidf_feature_names, no_top_words)
# to read in files from a folder
import os
# Maps each ".txt" file name found directly inside `filepath` to its full text.
files = {}
filepath = '../A-data/mallet-sample-data'
for filename in os.listdir(filepath):
    print(filename)
    if not filename.endswith(".txt"):
        continue
    # Build the path with os.path.join instead of hard-coding '/'.
    fpath = os.path.join(filepath, filename)
    # Explicit encoding so decoding does not depend on the platform locale.
    # NOTE(review): assumes the sample files are UTF-8 — confirm.
    with open(fpath, "r", encoding="utf-8") as handle:
        files[filename] = handle.read()
print(len(files))
#for filename, text in files.items():
# print(filename)
# print("=" * 80)
# print(text)