LDA Clean

A cleaned-up walkthrough of LDA topic modelling on the 20 Newsgroups dataset, using pandas, gensim, NLTK and spaCy.

OVERVIEW

1. GET the data
2. CLEAN the data with regex
3. TOKENIZE the data
4. REMOVE stopwords from the data
5. LEMMATISE the data with spaCy
6. BUILD the LDA model with gensim
7. FIND the dominant topic for each document

Environment setup is sketched just below.
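The pipeline needs NLTK's stopword list and a spaCy English model installed locally; a one-off setup sketch (the model name below is the usual small English model and may need adjusting for your environment):

import nltk
nltk.download('stopwords')  # stopword list used in step 4
# spaCy English model used for lemmatisation; run once from a shell:
#   python -m spacy download en_core_web_sm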
In [7]:
import pandas as pd
def get_data(url):
    df = pd.read_json(url)
    return df

def do_some_eda(df):
    print(df.target_names.unique())
    print(df.head())  # head() alone is a no-op inside a function, so print it

import re
def clean_data(df):
    text_corpus = df.content.values.tolist()
    text_corpus = [re.sub(r'\S*@\S*\s?', '', doc) for doc in text_corpus]  # remove email addresses
    text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]        # collapse whitespace and newlines
    text_corpus = [re.sub(r"'", "", doc) for doc in text_corpus]           # remove single quotes
    return text_corpus

import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
 
def doc_to_words(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

        
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(texts):
    # drop stopwords from the documents passed in (not the global text_corpus)
    return [[word for word in gensim.utils.simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]
 
    
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])  # on older spaCy versions: spacy.load('en', ...)
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
import gensim.corpora as corpora
def get_model(data_lemmatized):
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               per_word_topics=True)
    return corpus, lda_model


def format_topics_sentences(ldamodel, corpus, texts):
    # Top-10 keywords for each topic, keyed by 1-based topic number
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row+1, topic_keywords))
 
    top10dict = dict(top10array)
 
    # For each document, keep the (topic_id, probability) pair with the highest probability
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True)
                                                for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)  # 1-based topic number
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
 


# def do_the_thing():
df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# do_some_eda(df)
text_corpus = clean_data(df)
words = list(doc_to_words(text_corpus)) 
words = remove_stopwords(words)
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

corpus, lda_model = get_model(data_lemmatized)

from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)
df_topic_sents_keywords
0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised
[(0,
  '0.010*"entry" + 0.010*"line" + 0.009*"write" + 0.008*"file" + '
  '0.007*"organization" + 0.006*"program" + 0.006*"section" + 0.006*"article" '
  '+ 0.005*"may" + 0.004*"bank"'),
 (1,
  '0.013*"not" + 0.013*"god" + 0.010*"would" + 0.010*"write" + 0.008*"say" + '
  '0.008*"do" + 0.007*"believe" + 0.007*"know" + 0.007*"organization" + '
  '0.007*"people"'),
 (2,
  '0.009*"would" + 0.007*"not" + 0.006*"government" + 0.006*"write" + '
  '0.005*"line" + 0.005*"gun" + 0.005*"law" + 0.005*"organization" + '
  '0.005*"key" + 0.005*"people"'),
 (3,
  '0.011*"line" + 0.009*"organization" + 0.009*"university" + '
  '0.008*"accelerator" + 0.007*"speed" + 0.007*"period" + 0.006*"mhz" + '
  '0.005*"sc" + 0.005*"get" + 0.005*"write"'),
 (4,
  '0.704*"ax" + 0.050*"max" + 0.004*"bhj" + 0.003*"_" + 0.003*"ey" + '
  '0.003*"giz" + 0.003*"rlk" + 0.003*"tm" + 0.003*"qax" + 0.002*"qq"'),
 (5,
  '0.019*"line" + 0.016*"organization" + 0.009*"write" + 0.009*"post" + '
  '0.008*"article" + 0.007*"host" + 0.007*"nntp" + 0.007*"do" + 0.007*"would" '
  '+ 0.007*"university"'),
 (6,
  '0.021*"god" + 0.017*"christian" + 0.011*"not" + 0.011*"church" + '
  '0.009*"law" + 0.009*"bible" + 0.008*"jesus" + 0.008*"truth" + 0.007*"say" + '
  '0.007*"do"'),
 (7,
  '0.015*"god" + 0.011*"atheist" + 0.010*"say" + 0.009*"not" + 0.008*"man" + '
  '0.008*"believe" + 0.007*"atheism" + 0.007*"write" + 0.007*"exist" + '
  '0.006*"do"'),
 (8,
  '0.009*"mouse" + 0.008*"not" + 0.007*"write" + 0.006*"article" + '
  '0.006*"line" + 0.006*"organization" + 0.005*"get" + 0.005*"would" + '
  '0.004*"use" + 0.004*"do"'),
 (9,
  '0.017*"printer" + 0.011*"line" + 0.010*"point" + 0.008*"polygon" + '
  '0.007*"print" + 0.007*"organization" + 0.006*"driver" + 0.005*"canon" + '
  '0.005*"write" + 0.005*"article"'),
 (10,
  '0.010*"armenian" + 0.007*"turkish" + 0.006*"people" + 0.006*"greek" + '
  '0.006*"state" + 0.005*"say" + 0.005*"turk" + 0.004*"war" + 0.004*"write" + '
  '0.004*"government"'),
 (11,
  '0.029*"_" + 0.023*"space" + 0.010*"c" + 0.009*"cx" + 0.009*"nasa" + '
  '0.007*"orbit" + 0.006*"mission" + 0.006*"organization" + 0.006*"earth" + '
  '0.005*"satellite"'),
 (12,
  '0.018*"window" + 0.013*"file" + 0.009*"line" + 0.009*"use" + 0.008*"image" '
  '+ 0.008*"program" + 0.007*"run" + 0.006*"server" + 0.006*"system" + '
  '0.006*"write"'),
 (13,
  '0.021*"not" + 0.013*"do" + 0.010*"write" + 0.009*"line" + 0.009*"be" + '
  '0.008*"article" + 0.007*"think" + 0.007*"organization" + 0.007*"would" + '
  '0.006*"go"'),
 (14,
  '0.020*"not" + 0.012*"do" + 0.010*"be" + 0.009*"go" + 0.009*"say" + '
  '0.009*"people" + 0.008*"get" + 0.008*"would" + 0.007*"think" + '
  '0.007*"know"'),
 (15,
  '0.019*"game" + 0.018*"team" + 0.010*"line" + 0.010*"organization" + '
  '0.010*"play" + 0.008*"year" + 0.008*"hockey" + 0.008*"win" + 0.007*"get" + '
  '0.007*"season"'),
 (16,
  '0.012*"battery" + 0.009*"expose" + 0.008*"event" + 0.007*"new" + '
  '0.006*"gainey" + 0.006*"easter" + 0.005*"odometer" + 0.004*"organization" + '
  '0.004*"scope" + 0.004*"cycle"'),
 (17,
  '0.023*"organization" + 0.021*"line" + 0.020*"post" + 0.016*"nntp" + '
  '0.015*"host" + 0.012*"university" + 0.010*"write" + 0.009*"article" + '
  '0.007*"distribution" + 0.007*"mail"'),
 (18,
  '0.012*"line" + 0.011*"organization" + 0.009*"not" + 0.008*"drive" + '
  '0.007*"use" + 0.007*"system" + 0.007*"post" + 0.007*"do" + 0.007*"key" + '
  '0.006*"get"'),
 (19,
  '0.012*"israel" + 0.011*"israeli" + 0.007*"would" + 0.007*"right" + '
  '0.007*"say" + 0.007*"human" + 0.006*"not" + 0.006*"arab" + 0.006*"people" + '
  '0.006*"christian"')]
Out[7]:
Dominant_Topic Perc_Contribution Topic_Keywords Text
0 15 0.6508 not, do, be, go, say, people, get, would, thin... From: (wheres my thing) Subject: WHAT car is t...
1 19 0.4511 line, organization, not, drive, use, system, p... From: (Guy Kuo) Subject: SI Clock Poll - Final...
2 15 0.3995 not, do, be, go, say, people, get, would, thin... From: (Thomas E Willis) Subject: PB questions....
3 18 0.4821 organization, line, post, nntp, host, universi... From: (Joe Green) Subject: Re: Weitek P9000 ? ...
4 19 0.7449 line, organization, not, drive, use, system, p... From: (Jonathan McDowell) Subject: Re: Shuttle...
... ... ... ... ...
11309 3 0.4866 would, not, government, write, line, gun, law,... From: (Jim Zisfein) Subject: Re: Migraines and...
11310 19 0.7467 line, organization, not, drive, use, system, p... From: Subject: Screen Death: Mac Plus/512 Line...
11311 19 0.6690 line, organization, not, drive, use, system, p... From: (Will Estes) Subject: Mounting CPU Coole...
11312 10 0.7848 printer, line, point, polygon, print, organiza... From: (Steven Collins) Subject: Re: Sphere fro...
11313 18 0.7186 organization, line, post, nntp, host, universi... From: (Kevin J. Gunning) Subject: stolen CBR90...

11314 rows × 4 columns
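As a quick usage sketch (the sentence below is invented, not taken from the dataset), the same pipeline functions can score a fresh document against the trained model. Note that get_document_topics returns 0-based topic ids, whereas the table above numbers topics from 1:

# Hypothetical new document, pushed through the same cleaning/lemmatisation steps
new_doc = ["The shuttle launch was delayed because of a problem with the engines"]
new_words = remove_stopwords(list(doc_to_words(new_doc)))
new_lemmas = lemmatization(new_words)
new_bow = lda_model.id2word.doc2bow(new_lemmas[0])
# Three most likely topics (0-based ids) with their probabilities
print(sorted(lda_model.get_document_topics(new_bow), key=lambda x: -x[1])[:3])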

In [3]:
# import re
# text_corpus = df.content.values.tolist()
# text_corpus = [re.sub('\S*@\S*\s?', '', doc) for doc in text_corpus] #removing email addresses
# text_corpus = [re.sub('\s+', ' ', doc) for doc in text_corpus] #removing newline characters
# text_corpus = [re.sub("\'", "", doc) for doc in text_corpus] #removing single quote characters
 
# print(text_corpus[1])
In [4]:
# import gensim
# import warnings
# warnings.simplefilter("ignore", DeprecationWarning)
 
# def doc_to_words(sentences):
#     for sentence in sentences:
#         yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

# words = list(doc_to_words(text_corpus)) 
# print(words[1])
In [19]:
# from nltk.corpus import stopwords
# stop_words = stopwords.words('english')
# stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
# def remove_stopwords(text):
#     return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in text_corpus]
 
# words = remove_stopwords(words)
 
# print(words[1])
['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']
In [20]:
import spacy
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
# nlp = spacy.load('en', disable=['parser', 'ner'])  # shortcut name on older spaCy versions
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
0 documents lemmatised
500 documents lemmatised
1000 documents lemmatised
1500 documents lemmatised
2000 documents lemmatised
2500 documents lemmatised
3000 documents lemmatised
3500 documents lemmatised
4000 documents lemmatised
4500 documents lemmatised
5000 documents lemmatised
5500 documents lemmatised
6000 documents lemmatised
6500 documents lemmatised
7000 documents lemmatised
7500 documents lemmatised
8000 documents lemmatised
8500 documents lemmatised
9000 documents lemmatised
9500 documents lemmatised
10000 documents lemmatised
10500 documents lemmatised
11000 documents lemmatised
In [25]:
# Create Dictionary
import gensim.corpora as corpora
id2word = corpora.Dictionary(data_lemmatized)
 
# Create Corpus
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
 
# Build LDA model
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           per_word_topics=True)
In [26]:
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
[(0,
  '0.013*"drive" + 0.012*"car" + 0.012*"not" + 0.009*"line" + '
  '0.008*"organization" + 0.007*"write" + 0.007*"article" + 0.007*"do" + '
  '0.006*"be" + 0.006*"light"'),
 (1,
  '0.695*"ax" + 0.048*"max" + 0.009*"_" + 0.004*"rlk" + 0.003*"bhj" + '
  '0.003*"ey" + 0.003*"giz" + 0.002*"qax" + 0.002*"tm" + 0.002*"chz"'),
 (2,
  '0.027*"key" + 0.015*"encryption" + 0.014*"clipper" + 0.014*"chip" + '
  '0.009*"government" + 0.008*"security" + 0.007*"escrow" + 0.007*"system" + '
  '0.007*"would" + 0.007*"public"'),
 (3,
  '0.008*"state" + 0.008*"not" + 0.006*"law" + 0.006*"write" + '
  '0.006*"armenian" + 0.005*"government" + 0.005*"line" + 0.005*"would" + '
  '0.004*"exist" + 0.004*"question"'),
 (4,
  '0.014*"not" + 0.010*"do" + 0.008*"would" + 0.008*"line" + 0.008*"write" + '
  '0.007*"year" + 0.007*"be" + 0.007*"get" + 0.007*"organization" + '
  '0.006*"time"'),
 (5,
  '0.017*"key" + 0.015*"bit" + 0.012*"line" + 0.012*"_" + 0.009*"organization" '
  '+ 0.008*"use" + 0.008*"number" + 0.008*"serial" + 0.008*"window" + '
  '0.008*"c"'),
 (6,
  '0.030*"space" + 0.012*"nasa" + 0.011*"orbit" + 0.010*"mission" + '
  '0.009*"mar" + 0.007*"earth" + 0.007*"satellite" + 0.006*"spacecraft" + '
  '0.006*"shuttle" + 0.006*"probe"'),
 (7,
  '0.013*"game" + 0.012*"hockey" + 0.010*"cub" + 0.009*"league" + '
  '0.008*"division" + 0.008*"db" + 0.007*"lose" + 0.006*"hawk" + 0.006*"line" '
  '+ 0.006*"min"'),
 (8,
  '0.015*"not" + 0.010*"say" + 0.009*"do" + 0.009*"people" + 0.008*"would" + '
  '0.007*"be" + 0.007*"write" + 0.007*"think" + 0.007*"god" + 0.007*"know"'),
 (9,
  '0.013*"greek" + 0.010*"article" + 0.009*"write" + 0.009*"organization" + '
  '0.009*"get" + 0.009*"line" + 0.008*"not" + 0.007*"car" + 0.007*"greece" + '
  '0.007*"dealer"'),
 (10,
  '0.014*"wire" + 0.011*"ground" + 0.008*"ax" + 0.008*"line" + '
  '0.008*"organization" + 0.007*"outlet" + 0.006*"post" + 0.006*"neutral" + '
  '0.006*"wiring" + 0.006*"write"'),
 (11,
  '0.029*"bike" + 0.014*"line" + 0.013*"dod" + 0.012*"organization" + '
  '0.012*"motorcycle" + 0.012*"ride" + 0.011*"rider" + 0.008*"post" + '
  '0.007*"nntp" + 0.007*"host"'),
 (12,
  '0.021*"not" + 0.012*"do" + 0.010*"write" + 0.008*"be" + 0.008*"line" + '
  '0.007*"get" + 0.007*"would" + 0.007*"drive" + 0.007*"scsi" + '
  '0.007*"article"'),
 (13,
  '0.010*"write" + 0.008*"line" + 0.008*"not" + 0.008*"article" + '
  '0.008*"organization" + 0.006*"would" + 0.006*"israel" + 0.006*"do" + '
  '0.006*"state" + 0.006*"right"'),
 (14,
  '0.015*"gun" + 0.012*"game" + 0.011*"team" + 0.008*"not" + 0.008*"play" + '
  '0.007*"get" + 0.007*"line" + 0.007*"organization" + 0.007*"year" + '
  '0.006*"go"'),
 (15,
  '0.007*"mail" + 0.007*"_" + 0.006*"file" + 0.006*"cx" + 0.006*"ripem" + '
  '0.006*"line" + 0.006*"list" + 0.006*"information" + 0.005*"post" + '
  '0.005*"available"'),
 (16,
  '0.014*"line" + 0.014*"drive" + 0.012*"organization" + 0.010*"card" + '
  '0.009*"university" + 0.008*"disk" + 0.008*"driver" + 0.008*"post" + '
  '0.008*"video" + 0.008*"write"'),
 (17,
  '0.011*"new" + 0.011*"line" + 0.010*"organization" + 0.009*"gm" + '
  '0.008*"university" + 0.008*"not" + 0.007*"car" + 0.006*"write" + '
  '0.006*"good" + 0.005*"think"'),
 (18,
  '0.020*"line" + 0.016*"organization" + 0.011*"post" + 0.011*"file" + '
  '0.010*"host" + 0.009*"nntp" + 0.008*"program" + 0.008*"window" + '
  '0.007*"university" + 0.007*"get"'),
 (19,
  '0.010*"not" + 0.007*"do" + 0.007*"get" + 0.006*"window" + '
  '0.005*"organization" + 0.005*"be" + 0.005*"line" + 0.005*"space" + '
  '0.005*"would" + 0.005*"use"')]
In [ ]:
# # Visualize the topics
# import pyLDAvis
# import pyLDAvis.gensim  
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
# vis
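Note for newer environments: in pyLDAvis 3.x the gensim helper module was renamed, so the commented cell above would become (a hedged sketch, assuming pyLDAvis >= 3.0):

import pyLDAvis
import pyLDAvis.gensim_models  # renamed from pyLDAvis.gensim in pyLDAvis 3.x
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word)
vis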
In [ ]:
def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=text_corpus):
    # Array of top 10 topics
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row+1, topic_keywords))
 
    top10dict = dict(top10array)
 
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns=["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1],4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
 
df_topic_sents_keywords = format_topics_sentences()
In [ ]:
# Pick the most representative document for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)
 
# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()