In [2]:
import pandas as pd
df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(df.target_names.unique())
df.head()
['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'
 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'
 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'
 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'
 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']
Out[2]:
content target target_names
0 From: lerxst@wam.umd.edu (where's my thing)\nS... 7 rec.autos
1 From: guykuo@carson.u.washington.edu (Guy Kuo)... 4 comp.sys.mac.hardware
2 From: twillis@ec.ecn.purdue.edu (Thomas E Will... 4 comp.sys.mac.hardware
3 From: jgreen@amber (Joe Green)\nSubject: Re: W... 1 comp.graphics
4 From: jcm@head-cfa.harvard.edu (Jonathan McDow... 14 sci.space
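Before any cleaning, it is worth a quick sanity check on the size and balance of the dataset. A minimal sketch (not part of the original run) using the dataframe loaded above:
In [ ]:
# Illustrative check: overall shape and number of documents per newsgroup
print(df.shape)
print(df.target_names.value_counts())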
In [4]:
import re
text_corpus = df.content.values.tolist()
text_corpus = [re.sub(r'\S*@\S*\s?', '', doc) for doc in text_corpus]  # remove email addresses
text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]        # collapse whitespace runs (including newlines)
text_corpus = [re.sub(r"'", "", doc) for doc in text_corpus]           # remove single quote characters
 
print(text_corpus[1])
From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: carson.u.washington.edu A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll. Please send a brief message detailing your experiences with the procedure. Top speed attained, CPU rated speed, add on cards and adapters, heat sinks, hour of usage per day, floppy disk functionality with 800 and 1.4 m floppies are especially requested. I will be summarizing in the next two days, so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll. Thanks. Guy Kuo 
In [18]:
import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
 
def doc_to_words(sentences):
    # yield one tokenised document at a time; deacc=True strips accents during tokenisation
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)

words = list(doc_to_words(text_corpus)) 
print(words[1])
['from', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'of', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'edu', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'si', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'floppies', 'are', 'especially', 'requested', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'days', 'so', 'please', 'add', 'to', 'the', 'network', 'knowledge', 'base', 'if', 'you', 'have', 'done', 'the', 'clock', 'upgrade', 'and', 'havent', 'answered', 'this', 'poll', 'thanks', 'guy', 'kuo']
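Note that gensim.utils.simple_preprocess lowercases each document, optionally strips accents (deacc=True), and by default keeps only alphabetic tokens between 2 and 15 characters, which is why numbers such as "800" disappear from the tokens above. A small, hypothetical illustration:
In [ ]:
# Illustration only: non-alphabetic tokens are dropped and everything is lowercased
sample = "Top speed attained with 800 and 1.4 m floppies!"
print(gensim.utils.simple_preprocess(sample, deacc=True))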
In [19]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(texts):
    # drop stop words from each tokenised document (iterate over the argument, not the module-level text_corpus)
    return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words]
            for doc in texts]
 
words = remove_stopwords(words)
 
print(words[1])
['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']
In [20]:
import spacy
# the 'en' shortcut link is deprecated in newer spaCy releases; load the small English model directly
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# nlp = spacy.load('en', disable=['parser', 'ner'])  # older spaCy installs with the 'en' shortcut link
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if idx % 500 == 0:
            print(f'{idx} documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised
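Lemmatising roughly 11,000 documents one nlp() call at a time is slow. spaCy's nlp.pipe can stream the documents through the pipeline in batches instead; a sketch of the same step under that assumption (equivalent output, not part of the original run):
In [ ]:
def lemmatization_batched(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'], batch_size=200):
    # nlp.pipe processes the joined documents in batches, usually much faster than per-document calls
    docs = nlp.pipe((" ".join(sent) for sent in texts), batch_size=batch_size)
    return [[token.lemma_ for token in doc if token.pos_ in allowed_postags]
            for doc in docs]

# data_lemmatized = lemmatization_batched(words)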
In [25]:
# Create Dictionary
import gensim.corpora as corpora
id2word = corpora.Dictionary(data_lemmatized)
 
# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
 
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           per_word_topics=True)
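Fitting the model produces no output of its own, so one way to gauge how reasonable the 20 topics are before visualising is gensim's CoherenceModel. A sketch using the objects defined above (not part of the original run):
In [ ]:
from gensim.models import CoherenceModel

# c_v coherence over the lemmatised texts; higher values generally indicate more interpretable topics
coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                 dictionary=id2word, coherence='c_v')
print('Coherence (c_v):', coherence_model.get_coherence())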
In [26]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
[(0,
  '0.013*"drive" + 0.012*"car" + 0.012*"not" + 0.009*"line" + '
  '0.008*"organization" + 0.007*"write" + 0.007*"article" + 0.007*"do" + '
  '0.006*"be" + 0.006*"light"'),
 (1,
  '0.695*"ax" + 0.048*"max" + 0.009*"_" + 0.004*"rlk" + 0.003*"bhj" + '
  '0.003*"ey" + 0.003*"giz" + 0.002*"qax" + 0.002*"tm" + 0.002*"chz"'),
 (2,
  '0.027*"key" + 0.015*"encryption" + 0.014*"clipper" + 0.014*"chip" + '
  '0.009*"government" + 0.008*"security" + 0.007*"escrow" + 0.007*"system" + '
  '0.007*"would" + 0.007*"public"'),
 (3,
  '0.008*"state" + 0.008*"not" + 0.006*"law" + 0.006*"write" + '
  '0.006*"armenian" + 0.005*"government" + 0.005*"line" + 0.005*"would" + '
  '0.004*"exist" + 0.004*"question"'),
 (4,
  '0.014*"not" + 0.010*"do" + 0.008*"would" + 0.008*"line" + 0.008*"write" + '
  '0.007*"year" + 0.007*"be" + 0.007*"get" + 0.007*"organization" + '
  '0.006*"time"'),
 (5,
  '0.017*"key" + 0.015*"bit" + 0.012*"line" + 0.012*"_" + 0.009*"organization" '
  '+ 0.008*"use" + 0.008*"number" + 0.008*"serial" + 0.008*"window" + '
  '0.008*"c"'),
 (6,
  '0.030*"space" + 0.012*"nasa" + 0.011*"orbit" + 0.010*"mission" + '
  '0.009*"mar" + 0.007*"earth" + 0.007*"satellite" + 0.006*"spacecraft" + '
  '0.006*"shuttle" + 0.006*"probe"'),
 (7,
  '0.013*"game" + 0.012*"hockey" + 0.010*"cub" + 0.009*"league" + '
  '0.008*"division" + 0.008*"db" + 0.007*"lose" + 0.006*"hawk" + 0.006*"line" '
  '+ 0.006*"min"'),
 (8,
  '0.015*"not" + 0.010*"say" + 0.009*"do" + 0.009*"people" + 0.008*"would" + '
  '0.007*"be" + 0.007*"write" + 0.007*"think" + 0.007*"god" + 0.007*"know"'),
 (9,
  '0.013*"greek" + 0.010*"article" + 0.009*"write" + 0.009*"organization" + '
  '0.009*"get" + 0.009*"line" + 0.008*"not" + 0.007*"car" + 0.007*"greece" + '
  '0.007*"dealer"'),
 (10,
  '0.014*"wire" + 0.011*"ground" + 0.008*"ax" + 0.008*"line" + '
  '0.008*"organization" + 0.007*"outlet" + 0.006*"post" + 0.006*"neutral" + '
  '0.006*"wiring" + 0.006*"write"'),
 (11,
  '0.029*"bike" + 0.014*"line" + 0.013*"dod" + 0.012*"organization" + '
  '0.012*"motorcycle" + 0.012*"ride" + 0.011*"rider" + 0.008*"post" + '
  '0.007*"nntp" + 0.007*"host"'),
 (12,
  '0.021*"not" + 0.012*"do" + 0.010*"write" + 0.008*"be" + 0.008*"line" + '
  '0.007*"get" + 0.007*"would" + 0.007*"drive" + 0.007*"scsi" + '
  '0.007*"article"'),
 (13,
  '0.010*"write" + 0.008*"line" + 0.008*"not" + 0.008*"article" + '
  '0.008*"organization" + 0.006*"would" + 0.006*"israel" + 0.006*"do" + '
  '0.006*"state" + 0.006*"right"'),
 (14,
  '0.015*"gun" + 0.012*"game" + 0.011*"team" + 0.008*"not" + 0.008*"play" + '
  '0.007*"get" + 0.007*"line" + 0.007*"organization" + 0.007*"year" + '
  '0.006*"go"'),
 (15,
  '0.007*"mail" + 0.007*"_" + 0.006*"file" + 0.006*"cx" + 0.006*"ripem" + '
  '0.006*"line" + 0.006*"list" + 0.006*"information" + 0.005*"post" + '
  '0.005*"available"'),
 (16,
  '0.014*"line" + 0.014*"drive" + 0.012*"organization" + 0.010*"card" + '
  '0.009*"university" + 0.008*"disk" + 0.008*"driver" + 0.008*"post" + '
  '0.008*"video" + 0.008*"write"'),
 (17,
  '0.011*"new" + 0.011*"line" + 0.010*"organization" + 0.009*"gm" + '
  '0.008*"university" + 0.008*"not" + 0.007*"car" + 0.006*"write" + '
  '0.006*"good" + 0.005*"think"'),
 (18,
  '0.020*"line" + 0.016*"organization" + 0.011*"post" + 0.011*"file" + '
  '0.010*"host" + 0.009*"nntp" + 0.008*"program" + 0.008*"window" + '
  '0.007*"university" + 0.007*"get"'),
 (19,
  '0.010*"not" + 0.007*"do" + 0.007*"get" + 0.006*"window" + '
  '0.005*"organization" + 0.005*"be" + 0.005*"line" + 0.005*"space" + '
  '0.005*"would" + 0.005*"use"')]
In [27]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim  
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-27-0ce1051c8de2> in <module>
----> 5 vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)

[traceback truncated: prepare() was still computing term relevance inside
 pyLDAvis/_prepare.py via joblib's parallel workers when the kernel was
 interrupted manually]

KeyboardInterrupt: 
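The interrupt above landed while pyLDAvis was waiting on joblib worker processes. Keyword arguments to prepare are forwarded to the underlying computation, so forcing a single process with n_jobs=1 (and optionally a different scaling method such as mds='mmds') is one way to retry; a sketch:
In [ ]:
# Retry the visualisation without joblib multiprocessing
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', n_jobs=1)
vis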
In [ ]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=text_corpus):
    # Top-10 keywords for each topic, keyed by 1-indexed topic number
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row + 1, topic_keywords))
 
    top10dict = dict(top10array)
 
    # With per_word_topics=True, ldamodel[corpus] yields (doc_topics, word_topics, phi_values) per
    # document; topic[0] is the list of (topic_id, probability) pairs, sorted so the dominant topic comes first
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0] + 1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add the original text of each document to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return sent_topics_df
 
df_topic_sents_keywords = format_topics_sentences()
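A quick way to sanity-check the result (a small follow-up sketch, not in the original run) is to count how many documents each topic dominates and look at the first few rows:
In [ ]:
# Documents per dominant topic, plus a preview of the table
print(df_topic_sents_keywords.Dominant_Topic.value_counts())
df_topic_sents_keywords.head(10)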
In [ ]:
# Keep the single most representative document for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)
 
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()
In [ ]: