1. GET the data
2. CLEAN the data with regex
3. TOKENIZE the data
4. REMOVE stopwords from the data
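Each of these steps becomes a small function below. Before any of it runs, the NLTK stopword list and a spaCy English model have to be downloaded once, since they ship separately from the libraries themselves. A minimal setup sketch, assuming the small en_core_web_sm model (the one the lemmatisation step loads):

# One-time setup (sketch): fetch the NLTK stopword list and the small spaCy English model.
# Shell alternative for the spaCy model: python -m spacy download en_core_web_sm
import nltk
import spacy.cli
nltk.download('stopwords')
spacy.cli.download('en_core_web_sm')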
import pandas as pd

def get_data(url):
    """Load the newsgroups dataset from a JSON URL into a DataFrame."""
    df = pd.read_json(url)
    return df

def do_some_eda(df):
    """Quick exploratory look: the newsgroup labels and the first few rows."""
    print(df.target_names.unique())
    print(df.head())
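The rest of the pipeline only relies on the content and target_names columns of the newsgroups dataset, so a quick smoke test, not part of the main flow, is to load it once and confirm they are there:

# Sketch: confirm the columns the later steps expect ('content' and 'target_names').
sample_df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
print(sample_df.columns.tolist())
do_some_eda(sample_df)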
import re

def clean_data(df):
    """Light regex clean-up of the raw posts before tokenisation."""
    text_corpus = df.content.values.tolist()
    text_corpus = [re.sub(r'\S*@\S*\s?', '', doc) for doc in text_corpus]  # remove email addresses
    text_corpus = [re.sub(r'\s+', ' ', doc) for doc in text_corpus]        # collapse newlines and extra whitespace
    text_corpus = [re.sub(r"\'", "", doc) for doc in text_corpus]          # remove single quotes
    return text_corpus
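To make the three substitutions concrete, here is a toy run on a made-up snippet (the sample text is hypothetical, not from the dataset):

# Toy illustration of the three regex passes on a hypothetical snippet.
sample = "From: alice@example.com\nI'm posting\n\na test"
sample = re.sub(r'\S*@\S*\s?', '', sample)  # -> "From: I'm posting\n\na test"
sample = re.sub(r'\s+', ' ', sample)        # -> "From: I'm posting a test"
sample = re.sub(r"\'", "", sample)          # -> "From: Im posting a test"
print(sample)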
import gensim
import warnings
warnings.simplefilter("ignore", DeprecationWarning)

def doc_to_words(sentences):
    """Tokenise each document with gensim's simple_preprocess (deacc=True strips accents)."""
    for sentence in sentences:
        yield gensim.utils.simple_preprocess(str(sentence), deacc=True)
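simple_preprocess lowercases, strips punctuation, and keeps tokens between 2 and 15 characters, so the generator turns each cleaned document into a plain list of word tokens:

# Example: tokenising a single sentence.
print(list(doc_to_words(["Hello, World! This is tokenisation."])))
# [['hello', 'world', 'this', 'is', 'tokenisation']]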
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(['from', 'subject', 're', 'edu', 'use'])

def remove_stopwords(texts):
    """Drop NLTK and custom stopwords from each tokenised document."""
    return [[word for word in gensim.utils.simple_preprocess(str(doc))
             if word not in stop_words] for doc in texts]
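A quick check shows why the extra header-style words were added to the list; with the NLTK defaults alone, words like 'subject' and 'use' would survive:

# Sketch: ordinary and header-style stopwords are dropped, content words survive.
print(remove_stopwords([['from', 'the', 'subject', 'use', 'of', 'topic', 'models']]))
# [['topic', 'models']]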
import spacy
# The 'en' shorthand was removed in spaCy 3; load the small English model explicitly
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Keep only the lemmas of tokens whose part-of-speech tag is in allowed_postags."""
    texts_out = []
    for idx, sent in enumerate(texts):
        if idx % 500 == 0:
            print(str(idx) + ' documents lemmatised')
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out
import gensim.corpora as corpora

def get_model(data_lemmatized):
    # Create Dictionary
    id2word = corpora.Dictionary(data_lemmatized)
    # Create Corpus: bag-of-words representation of each document
    corpus = [id2word.doc2bow(text) for text in data_lemmatized]
    # Build LDA model
    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                                id2word=id2word,
                                                num_topics=20,
                                                per_word_topics=True)
    return corpus, lda_model
def format_topics_sentences(ldamodel, corpus, texts):
    # Map each topic number to its top keywords
    top10array = []
    for row in range(ldamodel.num_topics):
        wp = ldamodel.show_topic(row)  # top (word, probability) pairs for this topic
        topic_keywords = ", ".join([word for word, prop in wp])
        top10array.append((row + 1, topic_keywords))
    top10dict = dict(top10array)
    # For each document, sort its topics by contribution so the dominant topic comes first
    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True)
                                                for topic in ldamodel[corpus]])[0])
    sent_topics_df.columns = ["Data"]
    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0] + 1)
    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1], 4))
    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])
    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents.rename("Text")], axis=1)
    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]
    return sent_topics_df
# Run the pipeline end to end
df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')
# do_some_eda(df)
text_corpus = clean_data(df)
words = list(doc_to_words(text_corpus))
words = remove_stopwords(words)
data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])
corpus, lda_model = get_model(data_lemmatized)
from pprint import pprint
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]
df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)
df_topic_sents_keywords
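The num_topics=20 above is a starting guess rather than a tuned value; one way to sanity-check it, not part of the original walkthrough, is gensim's CoherenceModel run against the lemmatised texts:

# Sketch: score the trained model with c_v topic coherence (higher generally means more interpretable topics).
from gensim.models import CoherenceModel
coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                 dictionary=lda_model.id2word, coherence='c_v')
print('Coherence (c_v):', coherence_model.get_coherence())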
# # Visualize the topics (optional; requires the pyLDAvis package)
# import pyLDAvis
# import pyLDAvis.gensim  # in newer pyLDAvis releases this module is pyLDAvis.gensim_models
# pyLDAvis.enable_notebook()
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, lda_model.id2word)
# vis
# Pull out the single most representative document for each topic
sent_topics_sorteddf_mallet = pd.DataFrame()
sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')
for i, grp in sent_topics_outdf_grpd:
    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,
                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],
                                            axis=0)
# Reset Index
sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)
# Format
sent_topics_sorteddf_mallet.columns = ['Topic_Num', "Topic_Perc_Contrib", "Keywords", "Text"]
# Show
sent_topics_sorteddf_mallet.head()