LDA Topic Modeling Tutorial

Clean a text corpus, then fit and inspect an LDA topic model with gensim, NLTK, and spaCy.

OVERVIEW

1. GET the data
2. CLEAN the data with regex
3. TOKENIZE the data
4. REMOVE stopwords from the data
5. LEMMATIZE the data with spaCy
6. BUILD the LDA model with gensim
7. FORMAT the results (dominant topic per document)
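
The cells below wrap each of these steps in a helper function and then chain them together. As a rough sketch only (url_or_path and df_topics are placeholder names; the actual run further down builds the dataframe from local files instead), the pipeline reads:

df = get_data(url_or_path)                      # 1. GET
text_corpus = clean_data(df)                    # 2. CLEAN
words = list(doc_to_words(text_corpus))         # 3. TOKENIZE
words = remove_stopwords(words)                 # 4. REMOVE stopwords
data_lemmatized = lemmatization(words)          # 5. LEMMATIZE
corpus, lda_model = get_model(data_lemmatized)  # 6. BUILD the LDA model
df_topics = format_topics_sentences(lda_model, corpus, text_corpus)  # 7. FORMAT the results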
In [16]:
import pandas as pd
def get_data(url):
    df = pd.read_json(url)
    return df

def do_some_eda(df):
    print(df.target_names.unique())
    print(df.head())

import re
def clean_data(df):

    text_corpus = df.content.values.tolist()
    text_corpus = [re.sub(r'</?[A-Z]\w+>', '', doc) for doc in text_corpus]  # strip XML-style tags such as <DOC> and </DOCNO>
    text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]          # collapse newlines and repeated whitespace
    text_corpus = [re.sub("'", "", doc) for doc in text_corpus]              # remove single quotes
    return text_corpus

import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
 
def doc_to_words(sentences):
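    # gensim's simple_preprocess lowercases, strips punctuation, and drops very short/long tokens,
    # e.g. simple_preprocess("The Bill, passed today!") -> ['the', 'bill', 'passed', 'today']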
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

        
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(texts):
    # texts is a list of token lists (the output of doc_to_words)
    return [[word for word in doc if word not in stop_words] for doc in texts]
 
    
import spacy
# the 'en' shortcut only works in older spaCy releases; newer versions need the full model name
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
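    # keep only lemmas whose part of speech is in allowed_postags,
    # so most function words (pronouns, prepositions, etc.) drop out here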
    print(len(texts))
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
# Create Dictionary
import gensim.corpora as corpora
def get_model(data_lemmatized):
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]
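    # doc2bow converts a token list into (token_id, count) pairs,
    # e.g. id2word.doc2bow(['bill', 'house', 'bill']) might return [(0, 2), (1, 1)]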

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               per_word_topics=True)
    return corpus, lda_model


def format_topics_sentences(ldamodel, corpus, texts):
    # Top-10 keywords for each topic, keyed by (1-based) topic number
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row + 1, topic_keywords))
 
    top10dict = dict(top10array)
 
    # For each document, keep the (topic, probability) pair with the highest probability
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True)
                                                for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0] + 1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
In [17]:
# def do_the_thing():
# df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

import os
def get_data_from_files(path):
    results = []
    for file in os.listdir(path):
        with open(os.path.join(path, file), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results


# DATA SET 2
data_fd = get_data_from_files('110/110-f-d/')
data_fr = get_data_from_files('110/110-f-r/')
data_md = get_data_from_files('110/110-m-d/')
data_mr = get_data_from_files('110/110-m-r/')

female_data = data_fd + data_fr 
male_data = data_md + data_mr
dem_data = data_md + data_fd
rep_data = data_mr + data_fr

all_data = female_data + male_data

# DATA SET 2 -- SMALL (first 10 documents from each group)
female_data_sm = data_fd[:10] + data_fr[:10]
male_data_sm = data_md[:10] + data_mr[:10]
dem_data_sm = data_md[:10] + data_fd[:10]
rep_data_sm = data_mr[:10] + data_fr[:10]

all_data = female_data_sm + male_data_sm
df = pd.DataFrame({'content': all_data})
df.head()
Out[17]:
content
0 <DOC>\n<DOCNO>Mrs. JONES of Ohio. (PERSONAL EX...
1 <DOC>\n<DOCNO>Ms. ROS-LEHTINEN. (TOM LANTOS AN...
2 <DOC>\n<DOCNO>Ms. WATERS. (PROVIDING FOR CONSI...
3 <DOC>\n<DOCNO>Mrs. DAVIS of California. (PROVI...
4 <DOC>\n<DOCNO>Mrs. NAPOLITANO. (PASSENGER RAIL...
In [19]:
# do_some_eda(df)
text_corpus = clean_data(df)
words = list(doc_to_words(text_corpus)) 

words = remove_stopwords(words)
# words
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

corpus, lda_model = get_model(data_lemmatized)

from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)
df_topic_sents_keywords
40
0 documents lemmatised
[(0,
  '0.008*"bill" + 0.008*"house" + 0.007*"say" + 0.007*"speaker" + 0.006*"year" '
  '+ 0.006*"support" + 0.006*"time" + 0.006*"would" + 0.006*"work" + '
  '0.005*"go"'),
 (1,
  '0.009*"bill" + 0.009*"house" + 0.008*"go" + 0.007*"would" + 0.007*"year" + '
  '0.006*"time" + 0.006*"not" + 0.006*"american" + 0.006*"speaker" + '
  '0.006*"representative"'),
 (2,
  '0.008*"house" + 0.008*"speaker" + 0.007*"would" + 0.007*"time" + '
  '0.007*"support" + 0.006*"year" + 0.006*"bill" + 0.006*"go" + 0.006*"make" + '
  '0.006*"act"'),
 (3,
  '0.009*"go" + 0.009*"representative" + 0.008*"house" + 0.008*"bill" + '
  '0.008*"people" + 0.007*"american" + 0.007*"say" + 0.007*"speaker" + '
  '0.007*"make" + 0.006*"year"'),
 (4,
  '0.009*"speaker" + 0.008*"house" + 0.008*"go" + 0.007*"make" + 0.006*"time" '
  '+ 0.006*"bill" + 0.006*"people" + 0.006*"american" + 0.006*"say" + '
  '0.006*"not"'),
 (5,
  '0.010*"house" + 0.008*"go" + 0.007*"work" + 0.007*"american" + '
  '0.007*"speaker" + 0.007*"not" + 0.007*"state" + 0.006*"year" + '
  '0.006*"representative" + 0.006*"would"'),
 (6,
  '0.009*"go" + 0.008*"american" + 0.007*"bill" + 0.007*"house" + 0.007*"say" '
  '+ 0.006*"speaker" + 0.006*"time" + 0.006*"make" + 0.006*"would" + '
  '0.005*"not"'),
 (7,
  '0.007*"house" + 0.007*"time" + 0.007*"go" + 0.007*"speaker" + 0.007*"would" '
  '+ 0.007*"make" + 0.006*"year" + 0.006*"representative" + 0.006*"people" + '
  '0.006*"bill"'),
 (8,
  '0.008*"house" + 0.007*"speaker" + 0.007*"people" + 0.007*"bill" + '
  '0.007*"representative" + 0.006*"american" + 0.006*"year" + 0.005*"act" + '
  '0.005*"say" + 0.005*"work"'),
 (9,
  '0.011*"house" + 0.010*"go" + 0.008*"speaker" + 0.008*"bill" + '
  '0.008*"american" + 0.008*"work" + 0.007*"make" + 0.007*"people" + '
  '0.006*"want" + 0.006*"say"'),
 (10,
  '0.010*"go" + 0.009*"house" + 0.008*"speaker" + 0.008*"bill" + 0.008*"say" + '
  '0.007*"not" + 0.007*"american" + 0.007*"think" + 0.006*"year" + '
  '0.006*"people"'),
 (11,
  '0.008*"house" + 0.007*"say" + 0.007*"bill" + 0.007*"would" + 0.007*"go" + '
  '0.006*"speaker" + 0.006*"people" + 0.006*"state" + 0.006*"american" + '
  '0.006*"time"'),
 (12,
  '0.009*"house" + 0.007*"bill" + 0.006*"act" + 0.006*"say" + 0.006*"go" + '
  '0.006*"state" + 0.006*"american" + 0.006*"representative" + 0.006*"year" + '
  '0.006*"people"'),
 (13,
  '0.008*"house" + 0.007*"go" + 0.007*"speaker" + 0.007*"american" + '
  '0.007*"would" + 0.006*"representative" + 0.006*"bill" + 0.006*"people" + '
  '0.006*"year" + 0.006*"state"'),
 (14,
  '0.012*"bill" + 0.008*"house" + 0.008*"speaker" + 0.007*"go" + 0.007*"would" '
  '+ 0.006*"representative" + 0.006*"year" + 0.006*"time" + 0.005*"people" + '
  '0.005*"american"'),
 (15,
  '0.009*"house" + 0.009*"speaker" + 0.009*"say" + 0.008*"go" + 0.007*"time" + '
  '0.007*"american" + 0.006*"people" + 0.006*"not" + 0.006*"state" + '
  '0.006*"want"'),
 (16,
  '0.009*"bill" + 0.008*"house" + 0.007*"go" + 0.007*"would" + 0.006*"people" '
  '+ 0.006*"speaker" + 0.005*"time" + 0.005*"act" + 0.005*"representative" + '
  '0.005*"american"'),
 (17,
  '0.008*"house" + 0.008*"speaker" + 0.007*"bill" + 0.007*"would" + '
  '0.006*"time" + 0.006*"american" + 0.006*"go" + 0.006*"act" + 0.005*"work" + '
  '0.005*"year"'),
 (18,
  '0.009*"house" + 0.008*"go" + 0.008*"speaker" + 0.006*"representative" + '
  '0.006*"support" + 0.006*"people" + 0.006*"say" + 0.006*"year" + '
  '0.006*"work" + 0.006*"bill"'),
 (19,
  '0.009*"bill" + 0.007*"representative" + 0.007*"house" + 0.007*"congress" + '
  '0.007*"american" + 0.006*"state" + 0.006*"go" + 0.006*"support" + '
  '0.006*"year" + 0.006*"people"')]
Out[19]:
Dominant_Topic Perc_Contribution Topic_Keywords Text
0 1 0.5471 bill, house, say, speaker, year, support, time... Mrs. JONES of Ohio. (PERSONAL EXPLANATION -- ...
1 1 0.2777 bill, house, say, speaker, year, support, time... Ms. ROS-LEHTINEN. (TOM LANTOS AND HENRY J. HY...
2 10 0.3945 house, go, speaker, bill, american, work, make... Ms. WATERS. (PROVIDING FOR CONSIDERATION OF S...
3 6 0.3046 house, go, work, american, speaker, not, state... Mrs. DAVIS of California. (PROVIDING FOR CONS...
4 13 0.6177 house, bill, act, say, go, state, american, re... Mrs. NAPOLITANO. (PASSENGER RAIL INVESTMENT A...
5 1 0.2588 bill, house, say, speaker, year, support, time... Ms. SCHWARTZ. (WELCOMING MEMBERS OF PARLIAMEN...
6 1 0.6765 bill, house, say, speaker, year, support, time... Ms. BERKLEY. (RECOVERY REBATES AND ECONOMIC S...
7 2 0.3375 bill, house, go, would, year, time, not, ameri... Ms. ESHOO. (CONFERENCE REPORT ON H.R. 4040, C...
8 6 0.2586 house, go, work, american, speaker, not, state... Ms. SCHAKOWSKY. (PROTECT AMERICA ACT OF 2007 ...
9 1 0.5075 bill, house, say, speaker, year, support, time... Ms. GIFFORDS. (EMPLOYEE VERIFICATION AMENDMEN...
10 1 0.4394 bill, house, say, speaker, year, support, time... Ms. PRYCE of Ohio. (SUPPORTING EFFORTS TO INC...
11 6 0.7472 house, go, work, american, speaker, not, state... Mrs. SCHMIDT. (HONORING THE LIFE OF PATRICIA ...
12 6 0.2564 house, go, work, american, speaker, not, state... Mrs. BIGGERT. (RECOVERY REBATES AND ECONOMIC ...
13 2 0.4394 bill, house, go, would, year, time, not, ameri... Ms. GRANGER. (SHOSHONE-PAIUTE TRIBES OF THE D...
14 11 0.5507 go, house, speaker, bill, say, not, american, ... Mrs. EMERSON. (WILLIAM ``BILL CLAY POST OFFIC...
15 11 0.5633 go, house, speaker, bill, say, not, american, ... Mrs. BLACKBURN. (THE ENERGY PROBLEM IS ONE WE...
16 13 0.2226 house, bill, act, say, go, state, american, re... Mrs. CAPITO. (WE CANNOT ALLOW OUR DOMESTIC EN...
17 13 0.4348 house, bill, act, say, go, state, american, re... Ms. FOXX. (D&D DISPLAYS INNOVATES IN NORTH WI...
18 1 0.4321 bill, house, say, speaker, year, support, time... Mrs. JO ANN DAVIS of Virginia. (COMMEMORATING...
19 6 0.4760 house, go, work, american, speaker, not, state... Mrs. BONO. (HONORING JACK VALENTI -- (House o...
20 17 0.5430 bill, house, go, would, people, speaker, time,... Mr. BERRY. (RELATING TO THE HOUSE PROCEDURES ...
21 13 0.2873 house, bill, act, say, go, state, american, re... Mr. McNERNEY. (PROVIDING FOR CONSIDERATION OF...
22 6 0.3873 house, go, work, american, speaker, not, state... Mr. ANDREWS. (CONDEMNING THE PERSECUTION OF L...
23 17 0.4443 bill, house, go, would, people, speaker, time,... Mr. POMEROY. (RELATING TO THE HOUSE PROCEDURE...
24 11 0.2772 go, house, speaker, bill, say, not, american, ... Mr. BLUMENAUER. (RECOVERY REBATES AND ECONOMI...
25 8 0.2865 house, time, go, speaker, would, make, year, r... Mr. PATRICK J. MURPHY of Pennsylvania. (EMPLO...
26 10 0.5393 house, go, speaker, bill, american, work, make... Mr. MEEK of Florida. (30-SOMETHING WORKING GR...
27 2 0.2703 bill, house, go, would, year, time, not, ameri... Mr. REYES. (NATIONAL ENERGY SECURITY INTELLIG...
28 6 0.4751 house, go, work, american, speaker, not, state... Mr. HARE. (RECOVERY REBATES AND ECONOMIC STIM...
29 17 0.3969 bill, house, go, would, people, speaker, time,... Mr. SIRES. (TOM LANTOS AND HENRY J. HYDE UNIT...
30 11 0.6178 go, house, speaker, bill, say, not, american, ... Mr. PITTS. (EXELON -- (House of Representativ...
31 2 0.5347 bill, house, go, would, year, time, not, ameri... Mr. MORAN of Kansas. (RECOGNIZING THE SPECIAL...
32 2 0.2713 bill, house, go, would, year, time, not, ameri... Mr. SALI. (HIGH ENERGY PRICES -- (House of Re...
33 11 0.3862 go, house, speaker, bill, say, not, american, ... Mr. McCARTHY of California. (REPUBLICAN FRESH...
34 2 0.7438 bill, house, go, would, year, time, not, ameri... Mr. TIAHRT. (EMPLOYEE FREE CHOICE ACT -- (Hou...
35 13 0.4265 house, bill, act, say, go, state, american, re... Mr. PLATTS. (FREEDOM OF INFORMATION ACT AMEND...
36 17 0.3548 bill, house, go, would, people, speaker, time,... Mr. SMITH of Nebraska. (EXPAND OUR NATIONS EX...
37 13 0.4737 house, bill, act, say, go, state, american, re... Mr. HALL of Texas. (PRODUCED WATER UTILIZATIO...
38 1 0.9203 bill, house, say, speaker, year, support, time... Mr. RENZI. (HAWAIIAN HOMEOWNERSHIP OPPORTUNIT...
39 11 0.2845 go, house, speaker, bill, say, not, american, ... Mr. FOSSELLA. (TO ELIMINATE THE EXEMPTION FRO...
In [ ]:
# # Visualize the topics (newer pyLDAvis releases rename the module to pyLDAvis.gensim_models)
# import pyLDAvis
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# # the dictionary is not a global here, so take it from the fitted model
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, lda_model.id2word)
# vis
In [ ]:
# Most representative sentence for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()