In [2]:
import pandas as pd
# Read the dataset into a DataFrame.
# NOTE(review): the path argument is an empty string here — presumably stripped
# from this export; supply the real JSON path/URL before running.
df = pd.read_json('')
['' 'comp.sys.mac.hardware' '' ''
 'talk.politics.guns' '' ''
 '' '' 'talk.religion.misc'
 '' 'alt.atheism' 'sci.electronics' ''
 '' '' 'soc.religion.christian'
 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']
content target target_names
0 From: (where's my thing)\nS... 7
1 From: (Guy Kuo)... 4 comp.sys.mac.hardware
2 From: (Thomas E Will... 4 comp.sys.mac.hardware
3 From: jgreen@amber (Joe Green)\nSubject: Re: W... 1
4 From: (Jonathan McDow... 14
In [4]:
import re
# Pull the raw documents out of the `content` column as a plain list of strings.
text_corpus = df.content.values.tolist()
# The patterns are raw strings so that regex escapes such as \S and \s are not
# parsed as (invalid) Python string escapes, which newer interpreters warn about.
text_corpus = [re.sub(r'\S*@\S*\s?', '', doc) for doc in text_corpus]  # removing email addresses
text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]  # collapsing whitespace runs (incl. newlines) into one space
text_corpus = [re.sub(r"'", "", doc) for doc in text_corpus]  # removing single quote characters
From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll. Please send a brief message detailing your experiences with the procedure. Top speed attained, CPU rated speed, add on cards and adapters, heat sinks, hour of usage per day, floppy disk functionality with 800 and 1.4 m floppies are especially requested. I will be summarizing in the next two days, so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll. Thanks. Guy Kuo 
In [18]:
import gensim
import warnings
# Install a warnings filter that ignores every DeprecationWarning.
warnings.simplefilter("ignore", DeprecationWarning)
def doc_to_words(sentences):
    """Lazily tokenise each document in ``sentences``.

    Every item is coerced to ``str`` and handed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``; one token list is
    yielded per input item, in input order.
    """
    yield from (
        gensim.utils.simple_preprocess(str(document), deacc=True)
        for document in sentences
    )

# Materialise the generator so the tokenised documents can be indexed and reused.
words = list(doc_to_words(text_corpus)) 
['from', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'of', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'edu', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'si', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'floppies', 'are', 'especially', 'requested', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'days', 'so', 'please', 'add', 'to', 'the', 'network', 'knowledge', 'base', 'if', 'you', 'have', 'done', 'the', 'clock', 'upgrade', 'and', 'havent', 'answered', 'this', 'poll', 'thanks', 'guy', 'kuo']
In [19]:
from nltk.corpus import stopwords
# NLTK's English stop-word list followed by five extra tokens to filter out.
stop_words = [*stopwords.words('english'), 'from', 'subject', 're', 'edu', 'use']
def remove_stopwords(text, stop_word_list=None):
    """Drop stop words from every document in ``text``.

    Parameters
    ----------
    text : iterable
        Documents to filter. A document may be an already-tokenised sequence
        of words (filtered as-is) or a raw string (tokenised with
        ``gensim.utils.simple_preprocess`` first).
    stop_word_list : iterable of str, optional
        Words to remove. Defaults to the module-level ``stop_words``.

    Returns
    -------
    list of list of str
        One filtered token list per input document, in input order.
    """
    if stop_word_list is None:
        stop_word_list = stop_words
    blocked = set(stop_word_list)  # set membership is O(1) per token
    filtered = []
    # Iterate the `text` argument (not the global corpus) so the documents the
    # caller passes in are the ones that get filtered.
    for doc in text:
        tokens = gensim.utils.simple_preprocess(doc) if isinstance(doc, str) else doc
        filtered.append([word for word in tokens if word not in blocked])
    return filtered
# Rebind `words` to the value returned by remove_stopwords().
words = remove_stopwords(words)
['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']
In [20]:
import spacy
# Load the small English pipeline by its full package name: the bare 'en'
# shortcut link is a spaCy 2.x feature and no longer resolves on spaCy 3.
# The parser and NER components are disabled when loading.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Lemmatise tokenised documents, keeping only the allowed parts of speech.

    Each document in ``texts`` (a sequence of word strings) is joined with
    single spaces and run through the module-level ``nlp`` pipeline. The
    ``lemma_`` of every resulting token whose ``pos_`` is in
    ``allowed_postags`` is kept. A progress line is printed before every
    500th document, starting with document 0.

    Returns a list holding one list of lemmas per input document.
    """
    lemmatised_docs = []
    for position, tokens in enumerate(texts):
        if not position % 500:
            print(str(position) + ' documents lemmatised')
        parsed = nlp(" ".join(tokens))
        kept = []
        for token in parsed:
            if token.pos_ in allowed_postags:
                kept.append(token.lemma_)
        lemmatised_docs.append(kept)
    return lemmatised_docs
# Lemmatise every document, keeping only tokens tagged NOUN, ADJ, VERB or ADV.
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
In [25]:
# Create Dictionary
import gensim.corpora as corpora
# Build the gensim Dictionary from the lemmatised documents.
id2word = corpora.Dictionary(data_lemmatized)
# Bag-of-words corpus: one doc2bow() result per lemmatised document.
corpus = list(map(id2word.doc2bow, data_lemmatized))
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
In [26]:
from pprint import pprint
# Index the LDA model with the bag-of-words corpus.
# NOTE(review): presumably this yields the per-document topic distributions — confirm against the gensim docs.
doc_lda = lda_model[corpus]
  '0.013*"drive" + 0.012*"car" + 0.012*"not" + 0.009*"line" + '
  '0.008*"organization" + 0.007*"write" + 0.007*"article" + 0.007*"do" + '
  '0.006*"be" + 0.006*"light"'),
  '0.695*"ax" + 0.048*"max" + 0.009*"_" + 0.004*"rlk" + 0.003*"bhj" + '
  '0.003*"ey" + 0.003*"giz" + 0.002*"qax" + 0.002*"tm" + 0.002*"chz"'),
  '0.027*"key" + 0.015*"encryption" + 0.014*"clipper" + 0.014*"chip" + '
  '0.009*"government" + 0.008*"security" + 0.007*"escrow" + 0.007*"system" + '
  '0.007*"would" + 0.007*"public"'),
  '0.008*"state" + 0.008*"not" + 0.006*"law" + 0.006*"write" + '
  '0.006*"armenian" + 0.005*"government" + 0.005*"line" + 0.005*"would" + '
  '0.004*"exist" + 0.004*"question"'),
  '0.014*"not" + 0.010*"do" + 0.008*"would" + 0.008*"line" + 0.008*"write" + '
  '0.007*"year" + 0.007*"be" + 0.007*"get" + 0.007*"organization" + '
  '0.017*"key" + 0.015*"bit" + 0.012*"line" + 0.012*"_" + 0.009*"organization" '
  '+ 0.008*"use" + 0.008*"number" + 0.008*"serial" + 0.008*"window" + '
  '0.030*"space" + 0.012*"nasa" + 0.011*"orbit" + 0.010*"mission" + '
  '0.009*"mar" + 0.007*"earth" + 0.007*"satellite" + 0.006*"spacecraft" + '
  '0.006*"shuttle" + 0.006*"probe"'),
  '0.013*"game" + 0.012*"hockey" + 0.010*"cub" + 0.009*"league" + '
  '0.008*"division" + 0.008*"db" + 0.007*"lose" + 0.006*"hawk" + 0.006*"line" '
  '+ 0.006*"min"'),
  '0.015*"not" + 0.010*"say" + 0.009*"do" + 0.009*"people" + 0.008*"would" + '
  '0.007*"be" + 0.007*"write" + 0.007*"think" + 0.007*"god" + 0.007*"know"'),
  '0.013*"greek" + 0.010*"article" + 0.009*"write" + 0.009*"organization" + '
  '0.009*"get" + 0.009*"line" + 0.008*"not" + 0.007*"car" + 0.007*"greece" + '
  '0.014*"wire" + 0.011*"ground" + 0.008*"ax" + 0.008*"line" + '
  '0.008*"organization" + 0.007*"outlet" + 0.006*"post" + 0.006*"neutral" + '
  '0.006*"wiring" + 0.006*"write"'),
  '0.029*"bike" + 0.014*"line" + 0.013*"dod" + 0.012*"organization" + '
  '0.012*"motorcycle" + 0.012*"ride" + 0.011*"rider" + 0.008*"post" + '
  '0.007*"nntp" + 0.007*"host"'),
  '0.021*"not" + 0.012*"do" + 0.010*"write" + 0.008*"be" + 0.008*"line" + '
  '0.007*"get" + 0.007*"would" + 0.007*"drive" + 0.007*"scsi" + '
  '0.010*"write" + 0.008*"line" + 0.008*"not" + 0.008*"article" + '
  '0.008*"organization" + 0.006*"would" + 0.006*"israel" + 0.006*"do" + '
  '0.006*"state" + 0.006*"right"'),
  '0.015*"gun" + 0.012*"game" + 0.011*"team" + 0.008*"not" + 0.008*"play" + '
  '0.007*"get" + 0.007*"line" + 0.007*"organization" + 0.007*"year" + '
  '0.007*"mail" + 0.007*"_" + 0.006*"file" + 0.006*"cx" + 0.006*"ripem" + '
  '0.006*"line" + 0.006*"list" + 0.006*"information" + 0.005*"post" + '
  '0.014*"line" + 0.014*"drive" + 0.012*"organization" + 0.010*"card" + '
  '0.009*"university" + 0.008*"disk" + 0.008*"driver" + 0.008*"post" + '
  '0.008*"video" + 0.008*"write"'),
  '0.011*"new" + 0.011*"line" + 0.010*"organization" + 0.009*"gm" + '
  '0.008*"university" + 0.008*"not" + 0.007*"car" + 0.006*"write" + '
  '0.006*"good" + 0.005*"think"'),
  '0.020*"line" + 0.016*"organization" + 0.011*"post" + 0.011*"file" + '
  '0.010*"host" + 0.009*"nntp" + 0.008*"program" + 0.008*"window" + '
  '0.007*"university" + 0.007*"get"'),
  '0.010*"not" + 0.007*"do" + 0.007*"get" + 0.006*"window" + '
  '0.005*"organization" + 0.005*"be" + 0.005*"line" + 0.005*"space" + '
  '0.005*"would" + 0.005*"use"')]
In [27]:
# Visualize the topics
import pyLDAvis
import pyLDAvis.gensim  
# Prepare the pyLDAvis visualisation data from the model, corpus and dictionary.
# NOTE(review): newer pyLDAvis releases appear to expose this module as
# `pyLDAvis.gensim_models` — confirm against the installed version.
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
In [ ]:
def format_topics_sentences(ldamodel=None, corpus=None, texts=None):
    """Tabulate the dominant topic of every document.

    Parameters
    ----------
    ldamodel : optional
        Topic model exposing ``num_topics``, ``show_topic(topic_id)`` and
        ``ldamodel[corpus]``. Defaults to the module-level ``lda_model``.
    corpus : optional
        Bag-of-words documents to score. Defaults to the module-level
        ``corpus``.
    texts : optional
        Original documents, aligned with ``corpus`` by position. Defaults to
        the module-level ``text_corpus``.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns ``Dominant_Topic`` (1-based
        topic number), ``Perc_Contribution`` (weight rounded to 4 places),
        ``Topic_Keywords`` and ``Text``.
    """
    # Resolve defaults at call time so a re-trained model or rebuilt corpus is
    # picked up without having to re-run this definition.
    module_scope = globals()
    if ldamodel is None:
        ldamodel = module_scope['lda_model']
    if corpus is None:
        corpus = module_scope['corpus']
    if texts is None:
        texts = module_scope['text_corpus']

    # Keyword string per topic, keyed by the 1-based topic number.
    top10dict = {
        topic_id + 1: ", ".join(word for word, _ in ldamodel.show_topic(topic_id))
        for topic_id in range(ldamodel.num_topics)
    }

    records = []
    for doc_topics in ldamodel[corpus]:
        # A tuple result carries extra per-word data after the topic list
        # (assumed to be the per_word_topics=True output shape — TODO confirm);
        # otherwise the result already is the list of (topic_id, weight) pairs.
        if isinstance(doc_topics, tuple):
            doc_topics = doc_topics[0]
        best = max(doc_topics, key=lambda pair: pair[1], default=None)
        if best is None:
            # No topic reported for this document: keep the row, leave it blank.
            records.append((None, None, None))
            continue
        topic_number = int(best[0]) + 1
        records.append((topic_number, round(float(best[1]), 4), top10dict[topic_number]))

    sent_topics_df = pd.DataFrame(
        records, columns=['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords'])
    # Add original text to the end of the output
    contents = pd.Series(texts, name="Text")
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
# Call format_topics_sentences() with its default arguments and keep the result.
df_topic_sents_keywords = format_topics_sentences()
In [ ]:
# Pick the single most representative document for each dominant topic:
# within every topic group, the row with the highest Perc_Contribution.
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
# Collect the per-topic winners first and concatenate once; growing a
# DataFrame inside the loop re-copies it on every iteration.
top_rows = [grp.sort_values(['Perc_Contribution'], ascending=False).head(1)
            for _, grp in sent_topics_outdf_grpd]
if top_rows:
    sent_topics_sorteddf_mallet = pd.concat(top_rows, axis=0)
else:
    # No groups at all: keep an empty frame with the same four columns.
    sent_topics_sorteddf_mallet = pd.DataFrame(columns=df_topic_sents_keywords.columns)
# Reset Index
sent_topics_sorteddf_mallet = sent_topics_sorteddf_mallet.reset_index(drop=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
# Show
In [ ]: