HW8: Topic Modeling

In [ ]:
 
In [19]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every regular file directly inside ``path`` and return the contents.

    Args:
        path: Directory to read. A trailing path separator is optional.

    Returns:
        list[str]: One string per file, ordered by file name so the result
        is the same from run to run (``os.listdir`` order is arbitrary).
    """
    results = []
    for name in sorted(os.listdir(path)):
        full_path = os.path.join(path, name)
        # Skip sub-directories (e.g. .ipynb_checkpoints); open() would raise on them.
        if not os.path.isfile(full_path):
            continue
        # The context manager closes the handle even if read() raises.
        with open(full_path) as f:
            results.append(f.read())
    return results


## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def run_lda(data, n_components=2, max_iter=10, random_state=None):
    """Vectorize ``data`` with bag-of-words counts, fit LDA, and print the topics.

    Args:
        data: Iterable of raw document strings.
        n_components: Number of topics to fit (default 2).
        max_iter: Maximum number of LDA passes (default 10).
        random_state: Forwarded to ``LatentDirichletAllocation``. The default
            ``None`` keeps the previous behaviour; pass an int to make a single
            call reproducible regardless of which cells ran before it.

    Returns:
        tuple: ``(lda_model, lda, lda_vec, cv)`` where
            * ``lda_model`` is the value returned by ``lda.fit_transform``
              (despite the name, this is the transformed data, not the estimator),
            * ``lda`` is the fitted ``LatentDirichletAllocation`` estimator,
            * ``lda_vec`` is the count matrix produced by ``cv.fit_transform``,
            * ``cv`` is the fitted ``CountVectorizer``.
    """
    cv = CountVectorizer()
    lda_vec = cv.fit_transform(data)
    # No dense DataFrame of the counts is built here: it was never used, and
    # toarray() would materialise the whole document-term matrix in memory.
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=max_iter,
                                    learning_method='online',
                                    random_state=random_state)
    lda_model = lda.fit_transform(lda_vec)
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv


## =======================================================
## HELPERS
## =======================================================
import numpy as np
# Seed NumPy's global RNG once, at the moment this cell is executed.
# NOTE(review): presumably meant to make the LDA fit repeatable - confirm.
# Re-running a later cell without re-running this one does not reset the seed.
np.random.seed(210)

def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Args:
        model: Fitted topic model exposing ``components_`` (one row of term
            weights per topic).
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        top_n: Number of terms to show per topic (default 10).

    Prints, for each topic, a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples sorted by descending weight.
    """
    # get_feature_names() was removed in scikit-learn 1.2; prefer its
    # replacement when available and fall back for older installations.
    get_names = getattr(vectorizer, "get_feature_names_out", None)
    if get_names is None:
        get_names = vectorizer.get_feature_names
    # Fetch the vocabulary once instead of once per printed term.
    feature_names = get_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last top_n indices.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
        

## =======================================================
## VISUALIZING
## =======================================================        
import pyLDAvis.sklearn as LDAvis
import pyLDAvis

def start_vis(lda, lda_vec, cv, output_path='HW8_lda.html', mds='tsne'):
    """Build a pyLDAvis panel for a fitted LDA model and save it as HTML.

    Args:
        lda: Fitted LDA estimator.
        lda_vec: Document-term matrix the estimator was fitted on.
        cv: Fitted vectorizer that produced ``lda_vec``.
        output_path: Destination HTML file (default ``'HW8_lda.html'``).
        mds: Value passed as ``mds`` to ``LDAvis.prepare`` (default ``'tsne'``).

    Returns:
        None. The panel is only written to ``output_path``.
    """
    panel = LDAvis.prepare(lda, lda_vec, cv, mds=mds)
    pyLDAvis.save_html(panel, output_path)
In [20]:
# Read the documents stored in the Dog_Hike/ directory, then vectorize them
# and fit the LDA model. run_lda also prints the top terms of each topic.
data = get_data_from_files('Dog_Hike/')
lda_model, lda, lda_vec, cv = run_lda(data)
Topic 0:
[('dogs', 12.863012324998264), ('your', 8.112585989160534), ('and', 7.112589626251079), ('to', 6.543319792637165), ('dog', 5.616325392881221), ('can', 4.778847129376059), ('are', 4.624200467127805), ('like', 3.835997592426934), ('need', 3.7731362122576666), ('attention', 3.7642408038271964)]
Topic 1:
[('hiking', 8.337735772665093), ('or', 5.355394560812078), ('and', 5.306256862471911), ('hike', 5.191322646096004), ('are', 4.583011319081186), ('is', 4.507842325278862), ('some', 4.505087048930418), ('can', 4.440094214842077), ('dogs', 3.642509208823395), ('gear', 3.601087809730814)]
In [21]:
# Build the pyLDAvis panel for the fitted model and write it to HW8_lda.html.
start_vis(lda, lda_vec, cv)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))
In [ ]: