LDA Clean

A cleaned-up walkthrough of LDA topic modelling on the 20 Newsgroups dataset, using pandas, gensim, NLTK and spaCy.

OVERVIEW

1. GET the data
2. CLEAN the data with regex
3. TOKENIZE the data
4. REMOVE stopwords from the data
5. LEMMATISE the data with spaCy
6. BUILD the LDA model with gensim
7. FIND the dominant topic for each document

Environment setup is sketched just below.
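The pipeline needs NLTK's stopword list and a spaCy English model installed locally; a one-off setup sketch (the model name below is the usual small English model and may need adjusting for your environment):

import nltk
nltk.download('stopwords')  # stopword list used in step 4
# spaCy English model used for lemmatisation; run once from a shell:
#   python -m spacy download en_core_web_sm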
In [7]:
import pandas as pd
def get_data(url):
    df = pd.read_json(url)
    return df

def do_some_eda(df):
    print(df.target_names.unique())
    print(df.head())  # head() alone is a no-op inside a function, so print it

import re
def clean_data(df):
    text_corpus = df.content.values.tolist()
    text_corpus = [re.sub(r'\S*@\S*\s?', '', doc) for doc in text_corpus]  # remove email addresses
    text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]        # collapse whitespace and newlines
    text_corpus = [re.sub(r"'", "", doc) for doc in text_corpus]           # remove single quotes
    return text_corpus

import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
 
def doc_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

        
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(texts):
    # drop stopwords from the documents passed in (not the global text_corpus)
    return [[word for word in gensim.utils.simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]
 
    
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # on older spaCy versions: spacy.load('en', ...)
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
import gensim.corpora as corpora
def get_model(data_lemmatized):
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               per_word_topics=True)
    return corpus, lda_model


def format_topics_sentences(ldamodel, corpus, texts):
    # Top-10 keywords for each topic, keyed by 1-based topic number
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row+1, topic_keywords))
 
    top10dict = dict(top10array)
 
    # For each document, keep the (topic_id, probability) pair with the highest probability
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True)
                                                for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)  # 1-based topic number
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
 


# def do_the_thing():
df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# do_some_eda(df)
text_corpus = clean_data(df)
words = list(doc_to_words(text_corpus)) 
words = remove_stopwords(words)
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

corpus, lda_model = get_model(data_lemmatized)

from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)
df_topic_sents_keywords
0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised
[(0,
  '0.010*"entry" + 0.010*"line" + 0.009*"write" + 0.008*"file" + '
  '0.007*"organization" + 0.006*"program" + 0.006*"section" + 0.006*"article" '
  '+ 0.005*"may" + 0.004*"bank"'),
 (1,
  '0.013*"not" + 0.013*"god" + 0.010*"would" + 0.010*"write" + 0.008*"say" + '
  '0.008*"do" + 0.007*"believe" + 0.007*"know" + 0.007*"organization" + '
  '0.007*"people"'),
 (2,
  '0.009*"would" + 0.007*"not" + 0.006*"government" + 0.006*"write" + '
  '0.005*"line" + 0.005*"gun" + 0.005*"law" + 0.005*"organization" + '
  '0.005*"key" + 0.005*"people"'),
 (3,
  '0.011*"line" + 0.009*"organization" + 0.009*"university" + '
  '0.008*"accelerator" + 0.007*"speed" + 0.007*"period" + 0.006*"mhz" + '
  '0.005*"sc" + 0.005*"get" + 0.005*"write"'),
 (4,
  '0.704*"ax" + 0.050*"max" + 0.004*"bhj" + 0.003*"_" + 0.003*"ey" + '
  '0.003*"giz" + 0.003*"rlk" + 0.003*"tm" + 0.003*"qax" + 0.002*"qq"'),
 (5,
  '0.019*"line" + 0.016*"organization" + 0.009*"write" + 0.009*"post" + '
  '0.008*"article" + 0.007*"host" + 0.007*"nntp" + 0.007*"do" + 0.007*"would" '
  '+ 0.007*"university"'),
 (6,
  '0.021*"god" + 0.017*"christian" + 0.011*"not" + 0.011*"church" + '
  '0.009*"law" + 0.009*"bible" + 0.008*"jesus" + 0.008*"truth" + 0.007*"say" + '
  '0.007*"do"'),
 (7,
  '0.015*"god" + 0.011*"atheist" + 0.010*"say" + 0.009*"not" + 0.008*"man" + '
  '0.008*"believe" + 0.007*"atheism" + 0.007*"write" + 0.007*"exist" + '
  '0.006*"do"'),
 (8,
  '0.009*"mouse" + 0.008*"not" + 0.007*"write" + 0.006*"article" + '
  '0.006*"line" + 0.006*"organization" + 0.005*"get" + 0.005*"would" + '
  '0.004*"use" + 0.004*"do"'),
 (9,
  '0.017*"printer" + 0.011*"line" + 0.010*"point" + 0.008*"polygon" + '
  '0.007*"print" + 0.007*"organization" + 0.006*"driver" + 0.005*"canon" + '
  '0.005*"write" + 0.005*"article"'),
 (10,
  '0.010*"armenian" + 0.007*"turkish" + 0.006*"people" + 0.006*"greek" + '
  '0.006*"state" + 0.005*"say" + 0.005*"turk" + 0.004*"war" + 0.004*"write" + '
  '0.004*"government"'),
 (11,
  '0.029*"_" + 0.023*"space" + 0.010*"c" + 0.009*"cx" + 0.009*"nasa" + '
  '0.007*"orbit" + 0.006*"mission" + 0.006*"organization" + 0.006*"earth" + '
  '0.005*"satellite"'),
 (12,
  '0.018*"window" + 0.013*"file" + 0.009*"line" + 0.009*"use" + 0.008*"image" '
  '+ 0.008*"program" + 0.007*"run" + 0.006*"server" + 0.006*"system" + '
  '0.006*"write"'),
 (13,
  '0.021*"not" + 0.013*"do" + 0.010*"write" + 0.009*"line" + 0.009*"be" + '
  '0.008*"article" + 0.007*"think" + 0.007*"organization" + 0.007*"would" + '
  '0.006*"go"'),
 (14,
  '0.020*"not" + 0.012*"do" + 0.010*"be" + 0.009*"go" + 0.009*"say" + '
  '0.009*"people" + 0.008*"get" + 0.008*"would" + 0.007*"think" + '
  '0.007*"know"'),
 (15,
  '0.019*"game" + 0.018*"team" + 0.010*"line" + 0.010*"organization" + '
  '0.010*"play" + 0.008*"year" + 0.008*"hockey" + 0.008*"win" + 0.007*"get" + '
  '0.007*"season"'),
 (16,
  '0.012*"battery" + 0.009*"expose" + 0.008*"event" + 0.007*"new" + '
  '0.006*"gainey" + 0.006*"easter" + 0.005*"odometer" + 0.004*"organization" + '
  '0.004*"scope" + 0.004*"cycle"'),
 (17,
  '0.023*"organization" + 0.021*"line" + 0.020*"post" + 0.016*"nntp" + '
  '0.015*"host" + 0.012*"university" + 0.010*"write" + 0.009*"article" + '
  '0.007*"distribution" + 0.007*"mail"'),
 (18,
  '0.012*"line" + 0.011*"organization" + 0.009*"not" + 0.008*"drive" + '
  '0.007*"use" + 0.007*"system" + 0.007*"post" + 0.007*"do" + 0.007*"key" + '
  '0.006*"get"'),
 (19,
  '0.012*"israel" + 0.011*"israeli" + 0.007*"would" + 0.007*"right" + '
  '0.007*"say" + 0.007*"human" + 0.006*"not" + 0.006*"arab" + 0.006*"people" + '
  '0.006*"christian"')]
Out[7]:
Dominant_Topic Perc_Contribution Topic_Keywords Text
0 15 0.6508 not, do, be, go, say, people, get, would, thin... From: (wheres my thing) Subject: WHAT car is t...
1 19 0.4511 line, organization, not, drive, use, system, p... From: (Guy Kuo) Subject: SI Clock Poll - Final...
2 15 0.3995 not, do, be, go, say, people, get, would, thin... From: (Thomas E Willis) Subject: PB questions....
3 18 0.4821 organization, line, post, nntp, host, universi... From: (Joe Green) Subject: Re: Weitek P9000 ? ...
4 19 0.7449 line, organization, not, drive, use, system, p... From: (Jonathan McDowell) Subject: Re: Shuttle...
... ... ... ... ...
11309 3 0.4866 would, not, government, write, line, gun, law,... From: (Jim Zisfein) Subject: Re: Migraines and...
11310 19 0.7467 line, organization, not, drive, use, system, p... From: Subject: Screen Death: Mac Plus/512 Line...
11311 19 0.6690 line, organization, not, drive, use, system, p... From: (Will Estes) Subject: Mounting CPU Coole...
11312 10 0.7848 printer, line, point, polygon, print, organiza... From: (Steven Collins) Subject: Re: Sphere fro...
11313 18 0.7186 organization, line, post, nntp, host, universi... From: (Kevin J. Gunning) Subject: stolen CBR90...

11314 rows × 4 columns
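As a quick usage sketch (the sentence below is invented, not taken from the dataset), the same pipeline functions can score a fresh document against the trained model. Note that get_document_topics returns 0-based topic ids, whereas the table above numbers topics from 1:

# Hypothetical new document, pushed through the same cleaning/lemmatisation steps
new_doc = ["The shuttle launch was delayed because of a problem with the engines"]
new_words = remove_stopwords(list(doc_to_words(new_doc)))
new_lemmas = lemmatization(new_words)
new_bow = lda_model.id2word.doc2bow(new_lemmas[0])
# Three most likely topics (0-based ids) with their probabilities
print(sorted(lda_model.get_document_topics(new_bow), key=lambda x: -x[1])[:3])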

In [3]:
# import re
# text_corpus = df.content.values.tolist()
# text_corpus = [re.sub('\S*@\S*\s?', '', doc) for doc in text_corpus] #removing email addresses
# text_corpus = [re.sub('\s+', ' ', doc) for doc in text_corpus] #removing newline characters
# text_corpus = [re.sub("\'", "", doc) for doc in text_corpus] #removing single quote characters
 
# print(text_corpus[1])
In [4]:
# import gensim
# import warnings
# warnings.simplefilter("ignore", DeprecationWarning)
 
# def doc_to_words(sentences):
#     for sentence in sentences:
#         yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# words = list(doc_to_words(text_corpus)) 
# print(words[1])
In [19]:
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
# def remove_stopwords(text):
#     return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in text_corpus]
 
# words = remove_stopwords(words)
 
# print(words[1])
['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']
In [20]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# nlp = spacy.load('en', disable=['parser', 'ner'])  # shortcut name on older spaCy versions
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised
In [25]:
# Create Dictionary
import gensim.corpora as corpora
id2word = corpora.Dictionary(data_lemmatized)
 
# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
 
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           per_word_topics=True)
In [26]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
[(0,
  '0.013*"drive" + 0.012*"car" + 0.012*"not" + 0.009*"line" + '
  '0.008*"organization" + 0.007*"write" + 0.007*"article" + 0.007*"do" + '
  '0.006*"be" + 0.006*"light"'),
 (1,
  '0.695*"ax" + 0.048*"max" + 0.009*"_" + 0.004*"rlk" + 0.003*"bhj" + '
  '0.003*"ey" + 0.003*"giz" + 0.002*"qax" + 0.002*"tm" + 0.002*"chz"'),
 (2,
  '0.027*"key" + 0.015*"encryption" + 0.014*"clipper" + 0.014*"chip" + '
  '0.009*"government" + 0.008*"security" + 0.007*"escrow" + 0.007*"system" + '
  '0.007*"would" + 0.007*"public"'),
 (3,
  '0.008*"state" + 0.008*"not" + 0.006*"law" + 0.006*"write" + '
  '0.006*"armenian" + 0.005*"government" + 0.005*"line" + 0.005*"would" + '
  '0.004*"exist" + 0.004*"question"'),
 (4,
  '0.014*"not" + 0.010*"do" + 0.008*"would" + 0.008*"line" + 0.008*"write" + '
  '0.007*"year" + 0.007*"be" + 0.007*"get" + 0.007*"organization" + '
  '0.006*"time"'),
 (5,
  '0.017*"key" + 0.015*"bit" + 0.012*"line" + 0.012*"_" + 0.009*"organization" '
  '+ 0.008*"use" + 0.008*"number" + 0.008*"serial" + 0.008*"window" + '
  '0.008*"c"'),
 (6,
  '0.030*"space" + 0.012*"nasa" + 0.011*"orbit" + 0.010*"mission" + '
  '0.009*"mar" + 0.007*"earth" + 0.007*"satellite" + 0.006*"spacecraft" + '
  '0.006*"shuttle" + 0.006*"probe"'),
 (7,
  '0.013*"game" + 0.012*"hockey" + 0.010*"cub" + 0.009*"league" + '
  '0.008*"division" + 0.008*"db" + 0.007*"lose" + 0.006*"hawk" + 0.006*"line" '
  '+ 0.006*"min"'),
 (8,
  '0.015*"not" + 0.010*"say" + 0.009*"do" + 0.009*"people" + 0.008*"would" + '
  '0.007*"be" + 0.007*"write" + 0.007*"think" + 0.007*"god" + 0.007*"know"'),
 (9,
  '0.013*"greek" + 0.010*"article" + 0.009*"write" + 0.009*"organization" + '
  '0.009*"get" + 0.009*"line" + 0.008*"not" + 0.007*"car" + 0.007*"greece" + '
  '0.007*"dealer"'),
 (10,
  '0.014*"wire" + 0.011*"ground" + 0.008*"ax" + 0.008*"line" + '
  '0.008*"organization" + 0.007*"outlet" + 0.006*"post" + 0.006*"neutral" + '
  '0.006*"wiring" + 0.006*"write"'),
 (11,
  '0.029*"bike" + 0.014*"line" + 0.013*"dod" + 0.012*"organization" + '
  '0.012*"motorcycle" + 0.012*"ride" + 0.011*"rider" + 0.008*"post" + '
  '0.007*"nntp" + 0.007*"host"'),
 (12,
  '0.021*"not" + 0.012*"do" + 0.010*"write" + 0.008*"be" + 0.008*"line" + '
  '0.007*"get" + 0.007*"would" + 0.007*"drive" + 0.007*"scsi" + '
  '0.007*"article"'),
 (13,
  '0.010*"write" + 0.008*"line" + 0.008*"not" + 0.008*"article" + '
  '0.008*"organization" + 0.006*"would" + 0.006*"israel" + 0.006*"do" + '
  '0.006*"state" + 0.006*"right"'),
 (14,
  '0.015*"gun" + 0.012*"game" + 0.011*"team" + 0.008*"not" + 0.008*"play" + '
  '0.007*"get" + 0.007*"line" + 0.007*"organization" + 0.007*"year" + '
  '0.006*"go"'),
 (15,
  '0.007*"mail" + 0.007*"_" + 0.006*"file" + 0.006*"cx" + 0.006*"ripem" + '
  '0.006*"line" + 0.006*"list" + 0.006*"information" + 0.005*"post" + '
  '0.005*"available"'),
 (16,
  '0.014*"line" + 0.014*"drive" + 0.012*"organization" + 0.010*"card" + '
  '0.009*"university" + 0.008*"disk" + 0.008*"driver" + 0.008*"post" + '
  '0.008*"video" + 0.008*"write"'),
 (17,
  '0.011*"new" + 0.011*"line" + 0.010*"organization" + 0.009*"gm" + '
  '0.008*"university" + 0.008*"not" + 0.007*"car" + 0.006*"write" + '
  '0.006*"good" + 0.005*"think"'),
 (18,
  '0.020*"line" + 0.016*"organization" + 0.011*"post" + 0.011*"file" + '
  '0.010*"host" + 0.009*"nntp" + 0.008*"program" + 0.008*"window" + '
  '0.007*"university" + 0.007*"get"'),
 (19,
  '0.010*"not" + 0.007*"do" + 0.007*"get" + 0.006*"window" + '
  '0.005*"organization" + 0.005*"be" + 0.005*"line" + 0.005*"space" + '
  '0.005*"would" + 0.005*"use"')]
In [ ]:
# # Visualize the topics
# import pyLDAvis
# import pyLDAvis.gensim  
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis
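Note for newer environments: in pyLDAvis 3.x the gensim helper module was renamed, so the commented cell above would become (a hedged sketch, assuming pyLDAvis >= 3.0):

import pyLDAvis
import pyLDAvis.gensim_models  # renamed from pyLDAvis.gensim in pyLDAvis 3.x
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis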
In [ ]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=text_corpus):
    # Array of top 10 topics
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row+1, topic_keywords))
 
    top10dict = dict(top10array)
 
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns=["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1],4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
 
df_topic_sents_keywords = format_topics_sentences()
In [ ]:
# Pick the most representative document for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)
 
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()