## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every regular file directly inside ``path`` and return the texts.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        A list with one string per file, ordered by file name so that
        repeated runs see the documents in the same order. Subdirectories
        are skipped. An empty directory yields an empty list.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be
            opened.
    """
    results = []
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        if not os.path.isfile(full_path):
            continue
        # Files are decoded with the platform default encoding, as before.
        with open(full_path) as handle:
            results.append(handle.read())
    return results
## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
def run_lda(data, n_components=2, max_iter=10):
    """Fit a bag-of-words LDA topic model on ``data`` and print its topics.

    Args:
        data: Iterable of raw text documents, passed unchanged to
            ``CountVectorizer.fit_transform``.
        n_components: Number of topics to fit (default 2).
        max_iter: Maximum number of passes for the LDA fit (default 10).

    Returns:
        A tuple ``(lda_model, lda, lda_vec, cv)`` where ``lda_model`` is the
        result of ``lda.fit_transform(lda_vec)``, ``lda`` is the fitted
        ``LatentDirichletAllocation`` estimator, ``lda_vec`` is the count
        matrix produced by the vectorizer, and ``cv`` is the fitted
        ``CountVectorizer``.
    """
    cv = CountVectorizer()
    lda_vec = cv.fit_transform(data)
    # The count matrix is deliberately left as returned by the vectorizer:
    # converting it to a dense DataFrame was never used downstream.
    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=max_iter,
        learning_method='online',
    )
    lda_model = lda.fit_transform(lda_vec)
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv
## =======================================================
## HELPERS
## =======================================================
import numpy as np
# Seed NumPy's global random number generator at import time.
# NOTE(review): presumably this is meant to make the LDA fit repeatable,
# since no explicit random_state is passed to the estimator -- confirm.
np.random.seed(210)
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted vocabulary terms of each topic.

    For every row of ``model.components_`` this prints a ``Topic <idx>:``
    header followed by a list of ``(term, weight)`` tuples sorted by
    descending weight.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of
            per-term weights per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``
            or, on older scikit-learn releases, ``get_feature_names()``.
        top_n: Number of terms to show per topic (default 10).
    """
    # Resolve the vocabulary once: it is loop-invariant, and the legacy
    # get_feature_names() accessor no longer exists in newer scikit-learn.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % idx)
        # argsort is ascending; reverse it and keep the leading top_n indices.
        top_indices = topic.argsort()[::-1][:top_n]
        print([(feature_names[i], topic[i]) for i in top_indices])
## =======================================================
## VISUALIZING
## =======================================================
import pyLDAvis.sklearn as LDAvis
import pyLDAvis
def start_vis(lda, lda_vec, cv, output_path='HW8_lda.html', mds='tsne'):
    """Build the pyLDAvis panel for a fitted LDA model and save it as HTML.

    Args:
        lda: Fitted LDA estimator, as returned by ``run_lda``.
        lda_vec: Document-term count matrix the model was fitted on.
        cv: Fitted vectorizer that produced ``lda_vec``.
        output_path: Destination of the HTML report. Defaults to
            ``'HW8_lda.html'`` in the current working directory.
        mds: Scaling method name forwarded to ``LDAvis.prepare``
            (default ``'tsne'``).
    """
    panel = LDAvis.prepare(lda, lda_vec, cv, mds=mds)
    # Alternative: pyLDAvis.show(panel) displays the panel instead of
    # writing it to a file.
    pyLDAvis.save_html(panel, output_path)
# Script driver: load the documents found under Dog_Hike/, fit the LDA model
# (run_lda also prints the topics), then generate the visualization.
# NOTE(review): these statements run at import time because there is no
# `if __name__ == "__main__":` guard -- confirm nothing imports this module.
data = get_data_from_files('Dog_Hike/')
lda_model, lda, lda_vec, cv = run_lda(data)
start_vis(lda, lda_vec, cv)