LDA Topic Modeling Tutorial

Clean a text corpus, then fit and inspect an LDA topic model with gensim, NLTK, and spaCy.

OVERVIEW

1. GET the data
2. CLEAN the data with regex
3. TOKENIZE the data
4. REMOVE stopwords from the data
5. LEMMATIZE the data with spaCy
6. BUILD the LDA model with gensim
7. FORMAT the results (dominant topic per document)
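
The cells below wrap each of these steps in a helper function and then chain them together. As a rough sketch only (url_or_path and df_topics are placeholder names; the actual run further down builds the dataframe from local files instead), the pipeline reads:

df = get_data(url_or_path)                      # 1. GET
text_corpus = clean_data(df)                    # 2. CLEAN
words = list(doc_to_words(text_corpus))         # 3. TOKENIZE
words = remove_stopwords(words)                 # 4. REMOVE stopwords
data_lemmatized = lemmatization(words)          # 5. LEMMATIZE
corpus, lda_model = get_model(data_lemmatized)  # 6. BUILD the LDA model
df_topics = format_topics_sentences(lda_model, corpus, text_corpus)  # 7. FORMAT the results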
In [16]:
import pandas as pd
def get_data(url):
    df = pd.read_json(url)
    return df

def do_some_eda(df):
    print(df.target_names.unique())
    print(df.head())

import re
def clean_data(df):

    text_corpus = df.content.values.tolist()
    text_corpus = [re.sub(r'</?[A-Z]\w+>', '', doc) for doc in text_corpus]  # strip XML-style tags such as <DOC> and </DOCNO>
    text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]          # collapse newlines and repeated whitespace
    text_corpus = [re.sub("'", "", doc) for doc in text_corpus]              # remove single quotes
    return text_corpus

import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)
 
def doc_to_words(sentences):
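    # gensim's simple_preprocess lowercases, strips punctuation, and drops very short/long tokens,
    # e.g. simple_preprocess("The Bill, passed today!") -> ['the', 'bill', 'passed', 'today']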
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))

        
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])
 
def remove_stopwords(texts):
    # texts is a list of token lists (the output of doc_to_words)
    return [[word for word in doc if word not in stop_words] for doc in texts]
 
    
import spacy
# the 'en' shortcut only works in older spaCy releases; newer versions need the full model name
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
 
def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
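    # keep only lemmas whose part of speech is in allowed_postags,
    # so most function words (pronouns, prepositions, etc.) drop out here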
    print(len(texts))
    texts_out = []
    for idx, sent in enumerate(texts):
        if (idx) % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
 
# Create Dictionary
import gensim.corpora as corpora
def get_model(data_lemmatized):
    id2word = corpora.Dictionary(data_lemmatized)

    # Create Corpus
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]
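    # doc2bow converts a token list into (token_id, count) pairs,
    # e.g. id2word.doc2bow(['bill', 'house', 'bill']) might return [(0, 2), (1, 1)]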

    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                               id2word=id2word,
                                               num_topics=20, 
                                               per_word_topics=True)
    return corpus, lda_model


def format_topics_sentences(ldamodel, corpus, texts):
    # Top-10 keywords for each topic, keyed by (1-based) topic number
    top10array = []
 
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row + 1, topic_keywords))
 
    top10dict = dict(top10array)
 
    # For each document, keep the (topic, probability) pair with the highest probability
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True)
                                                for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0] + 1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
 
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return(sent_topics_df)
In [17]:
# def do_the_thing():
# df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')

import os
def get_data_from_files(path):
    results = []
    for file in os.listdir(path):
        with open(os.path.join(path, file), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results


# DATA SET 2
data_fd = get_data_from_files('110/110-f-d/')
data_fr = get_data_from_files('110/110-f-r/')
data_md = get_data_from_files('110/110-m-d/')
data_mr = get_data_from_files('110/110-m-r/')

female_data = data_fd + data_fr 
male_data = data_md + data_mr
dem_data = data_md + data_fd
rep_data = data_mr + data_fr

all_data = female_data + male_data

# DATA SET 2 -- SMALL (first 10 documents from each group)
female_data_sm = data_fd[:10] + data_fr[:10]
male_data_sm = data_md[:10] + data_mr[:10]
dem_data_sm = data_md[:10] + data_fd[:10]
rep_data_sm = data_mr[:10] + data_fr[:10]

all_data = female_data_sm + male_data_sm
df = pd.DataFrame({'content': all_data})
df.head()
Out[17]:
content
0 <DOC>\n<DOCNO>Mrs. JONES of Ohio. (PERSONAL EX...
1 <DOC>\n<DOCNO>Ms. ROS-LEHTINEN. (TOM LANTOS AN...
2 <DOC>\n<DOCNO>Ms. WATERS. (PROVIDING FOR CONSI...
3 <DOC>\n<DOCNO>Mrs. DAVIS of California. (PROVI...
4 <DOC>\n<DOCNO>Mrs. NAPOLITANO. (PASSENGER RAIL...
In [19]:
# do_some_eda(df)
text_corpus = clean_data(df)
words = list(doc_to_words(text_corpus)) 

words = remove_stopwords(words)
# words
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

corpus, lda_model = get_model(data_lemmatized)

from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)
df_topic_sents_keywords
40
0 documents lemmatised
[(0,
  '0.008*"bill" + 0.008*"house" + 0.007*"say" + 0.007*"speaker" + 0.006*"year" '
  '+ 0.006*"support" + 0.006*"time" + 0.006*"would" + 0.006*"work" + '
  '0.005*"go"'),
 (1,
  '0.009*"bill" + 0.009*"house" + 0.008*"go" + 0.007*"would" + 0.007*"year" + '
  '0.006*"time" + 0.006*"not" + 0.006*"american" + 0.006*"speaker" + '
  '0.006*"representative"'),
 (2,
  '0.008*"house" + 0.008*"speaker" + 0.007*"would" + 0.007*"time" + '
  '0.007*"support" + 0.006*"year" + 0.006*"bill" + 0.006*"go" + 0.006*"make" + '
  '0.006*"act"'),
 (3,
  '0.009*"go" + 0.009*"representative" + 0.008*"house" + 0.008*"bill" + '
  '0.008*"people" + 0.007*"american" + 0.007*"say" + 0.007*"speaker" + '
  '0.007*"make" + 0.006*"year"'),
 (4,
  '0.009*"speaker" + 0.008*"house" + 0.008*"go" + 0.007*"make" + 0.006*"time" '
  '+ 0.006*"bill" + 0.006*"people" + 0.006*"american" + 0.006*"say" + '
  '0.006*"not"'),
 (5,
  '0.010*"house" + 0.008*"go" + 0.007*"work" + 0.007*"american" + '
  '0.007*"speaker" + 0.007*"not" + 0.007*"state" + 0.006*"year" + '
  '0.006*"representative" + 0.006*"would"'),
 (6,
  '0.009*"go" + 0.008*"american" + 0.007*"bill" + 0.007*"house" + 0.007*"say" '
  '+ 0.006*"speaker" + 0.006*"time" + 0.006*"make" + 0.006*"would" + '
  '0.005*"not"'),
 (7,
  '0.007*"house" + 0.007*"time" + 0.007*"go" + 0.007*"speaker" + 0.007*"would" '
  '+ 0.007*"make" + 0.006*"year" + 0.006*"representative" + 0.006*"people" + '
  '0.006*"bill"'),
 (8,
  '0.008*"house" + 0.007*"speaker" + 0.007*"people" + 0.007*"bill" + '
  '0.007*"representative" + 0.006*"american" + 0.006*"year" + 0.005*"act" + '
  '0.005*"say" + 0.005*"work"'),
 (9,
  '0.011*"house" + 0.010*"go" + 0.008*"speaker" + 0.008*"bill" + '
  '0.008*"american" + 0.008*"work" + 0.007*"make" + 0.007*"people" + '
  '0.006*"want" + 0.006*"say"'),
 (10,
  '0.010*"go" + 0.009*"house" + 0.008*"speaker" + 0.008*"bill" + 0.008*"say" + '
  '0.007*"not" + 0.007*"american" + 0.007*"think" + 0.006*"year" + '
  '0.006*"people"'),
 (11,
  '0.008*"house" + 0.007*"say" + 0.007*"bill" + 0.007*"would" + 0.007*"go" + '
  '0.006*"speaker" + 0.006*"people" + 0.006*"state" + 0.006*"american" + '
  '0.006*"time"'),
 (12,
  '0.009*"house" + 0.007*"bill" + 0.006*"act" + 0.006*"say" + 0.006*"go" + '
  '0.006*"state" + 0.006*"american" + 0.006*"representative" + 0.006*"year" + '
  '0.006*"people"'),
 (13,
  '0.008*"house" + 0.007*"go" + 0.007*"speaker" + 0.007*"american" + '
  '0.007*"would" + 0.006*"representative" + 0.006*"bill" + 0.006*"people" + '
  '0.006*"year" + 0.006*"state"'),
 (14,
  '0.012*"bill" + 0.008*"house" + 0.008*"speaker" + 0.007*"go" + 0.007*"would" '
  '+ 0.006*"representative" + 0.006*"year" + 0.006*"time" + 0.005*"people" + '
  '0.005*"american"'),
 (15,
  '0.009*"house" + 0.009*"speaker" + 0.009*"say" + 0.008*"go" + 0.007*"time" + '
  '0.007*"american" + 0.006*"people" + 0.006*"not" + 0.006*"state" + '
  '0.006*"want"'),
 (16,
  '0.009*"bill" + 0.008*"house" + 0.007*"go" + 0.007*"would" + 0.006*"people" '
  '+ 0.006*"speaker" + 0.005*"time" + 0.005*"act" + 0.005*"representative" + '
  '0.005*"american"'),
 (17,
  '0.008*"house" + 0.008*"speaker" + 0.007*"bill" + 0.007*"would" + '
  '0.006*"time" + 0.006*"american" + 0.006*"go" + 0.006*"act" + 0.005*"work" + '
  '0.005*"year"'),
 (18,
  '0.009*"house" + 0.008*"go" + 0.008*"speaker" + 0.006*"representative" + '
  '0.006*"support" + 0.006*"people" + 0.006*"say" + 0.006*"year" + '
  '0.006*"work" + 0.006*"bill"'),
 (19,
  '0.009*"bill" + 0.007*"representative" + 0.007*"house" + 0.007*"congress" + '
  '0.007*"american" + 0.006*"state" + 0.006*"go" + 0.006*"support" + '
  '0.006*"year" + 0.006*"people"')]
Out[19]:
Dominant_Topic Perc_Contribution Topic_Keywords Text
0 1 0.5471 bill, house, say, speaker, year, support, time... Mrs. JONES of Ohio. (PERSONAL EXPLANATION -- ...
1 1 0.2777 bill, house, say, speaker, year, support, time... Ms. ROS-LEHTINEN. (TOM LANTOS AND HENRY J. HY...
2 10 0.3945 house, go, speaker, bill, american, work, make... Ms. WATERS. (PROVIDING FOR CONSIDERATION OF S...
3 6 0.3046 house, go, work, american, speaker, not, state... Mrs. DAVIS of California. (PROVIDING FOR CONS...
4 13 0.6177 house, bill, act, say, go, state, american, re... Mrs. NAPOLITANO. (PASSENGER RAIL INVESTMENT A...
5 1 0.2588 bill, house, say, speaker, year, support, time... Ms. SCHWARTZ. (WELCOMING MEMBERS OF PARLIAMEN...
6 1 0.6765 bill, house, say, speaker, year, support, time... Ms. BERKLEY. (RECOVERY REBATES AND ECONOMIC S...
7 2 0.3375 bill, house, go, would, year, time, not, ameri... Ms. ESHOO. (CONFERENCE REPORT ON H.R. 4040, C...
8 6 0.2586 house, go, work, american, speaker, not, state... Ms. SCHAKOWSKY. (PROTECT AMERICA ACT OF 2007 ...
9 1 0.5075 bill, house, say, speaker, year, support, time... Ms. GIFFORDS. (EMPLOYEE VERIFICATION AMENDMEN...
10 1 0.4394 bill, house, say, speaker, year, support, time... Ms. PRYCE of Ohio. (SUPPORTING EFFORTS TO INC...
11 6 0.7472 house, go, work, american, speaker, not, state... Mrs. SCHMIDT. (HONORING THE LIFE OF PATRICIA ...
12 6 0.2564 house, go, work, american, speaker, not, state... Mrs. BIGGERT. (RECOVERY REBATES AND ECONOMIC ...
13 2 0.4394 bill, house, go, would, year, time, not, ameri... Ms. GRANGER. (SHOSHONE-PAIUTE TRIBES OF THE D...
14 11 0.5507 go, house, speaker, bill, say, not, american, ... Mrs. EMERSON. (WILLIAM ``BILL CLAY POST OFFIC...
15 11 0.5633 go, house, speaker, bill, say, not, american, ... Mrs. BLACKBURN. (THE ENERGY PROBLEM IS ONE WE...
16 13 0.2226 house, bill, act, say, go, state, american, re... Mrs. CAPITO. (WE CANNOT ALLOW OUR DOMESTIC EN...
17 13 0.4348 house, bill, act, say, go, state, american, re... Ms. FOXX. (D&D DISPLAYS INNOVATES IN NORTH WI...
18 1 0.4321 bill, house, say, speaker, year, support, time... Mrs. JO ANN DAVIS of Virginia. (COMMEMORATING...
19 6 0.4760 house, go, work, american, speaker, not, state... Mrs. BONO. (HONORING JACK VALENTI -- (House o...
20 17 0.5430 bill, house, go, would, people, speaker, time,... Mr. BERRY. (RELATING TO THE HOUSE PROCEDURES ...
21 13 0.2873 house, bill, act, say, go, state, american, re... Mr. McNERNEY. (PROVIDING FOR CONSIDERATION OF...
22 6 0.3873 house, go, work, american, speaker, not, state... Mr. ANDREWS. (CONDEMNING THE PERSECUTION OF L...
23 17 0.4443 bill, house, go, would, people, speaker, time,... Mr. POMEROY. (RELATING TO THE HOUSE PROCEDURE...
24 11 0.2772 go, house, speaker, bill, say, not, american, ... Mr. BLUMENAUER. (RECOVERY REBATES AND ECONOMI...
25 8 0.2865 house, time, go, speaker, would, make, year, r... Mr. PATRICK J. MURPHY of Pennsylvania. (EMPLO...
26 10 0.5393 house, go, speaker, bill, american, work, make... Mr. MEEK of Florida. (30-SOMETHING WORKING GR...
27 2 0.2703 bill, house, go, would, year, time, not, ameri... Mr. REYES. (NATIONAL ENERGY SECURITY INTELLIG...
28 6 0.4751 house, go, work, american, speaker, not, state... Mr. HARE. (RECOVERY REBATES AND ECONOMIC STIM...
29 17 0.3969 bill, house, go, would, people, speaker, time,... Mr. SIRES. (TOM LANTOS AND HENRY J. HYDE UNIT...
30 11 0.6178 go, house, speaker, bill, say, not, american, ... Mr. PITTS. (EXELON -- (House of Representativ...
31 2 0.5347 bill, house, go, would, year, time, not, ameri... Mr. MORAN of Kansas. (RECOGNIZING THE SPECIAL...
32 2 0.2713 bill, house, go, would, year, time, not, ameri... Mr. SALI. (HIGH ENERGY PRICES -- (House of Re...
33 11 0.3862 go, house, speaker, bill, say, not, american, ... Mr. McCARTHY of California. (REPUBLICAN FRESH...
34 2 0.7438 bill, house, go, would, year, time, not, ameri... Mr. TIAHRT. (EMPLOYEE FREE CHOICE ACT -- (Hou...
35 13 0.4265 house, bill, act, say, go, state, american, re... Mr. PLATTS. (FREEDOM OF INFORMATION ACT AMEND...
36 17 0.3548 bill, house, go, would, people, speaker, time,... Mr. SMITH of Nebraska. (EXPAND OUR NATIONS EX...
37 13 0.4737 house, bill, act, say, go, state, american, re... Mr. HALL of Texas. (PRODUCED WATER UTILIZATIO...
38 1 0.9203 bill, house, say, speaker, year, support, time... Mr. RENZI. (HAWAIIAN HOMEOWNERSHIP OPPORTUNIT...
39 11 0.2845 go, house, speaker, bill, say, not, american, ... Mr. FOSSELLA. (TO ELIMINATE THE EXEMPTION FRO...
In [ ]:
# # Visualize the topics (newer pyLDAvis releases rename the module to pyLDAvis.gensim_models)
# import pyLDAvis
# import pyLDAvis.gensim
# pyLDAvis.enable_notebook()
# # the dictionary is not a global here, so take it from the fitted model
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, lda_model.id2word)
# vis
In [ ]:
# Most representative sentence for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
 
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
 
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)

# Reset Index    
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
 
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
 
# Show
sent_topics_sorteddf_mallet.head()