{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# LDA\n", "[Tutorial Here](http://dataskunkworks.com/2018/06/06/extracting-topics-from-11000-newsgroups-posts-with-python-gensim-and-lda/)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'\n", " 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'\n", " 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'\n", " 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'\n", " 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'\n", " 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']\n", "['rec.autos' 'comp.sys.mac.hardware' 'comp.graphics' 'sci.space'\n", " 'talk.politics.guns' 'sci.med' 'comp.sys.ibm.pc.hardware'\n", " 'comp.os.ms-windows.misc' 'rec.motorcycles' 'talk.religion.misc'\n", " 'misc.forsale' 'alt.atheism' 'sci.electronics' 'comp.windows.x'\n", " 'rec.sport.hockey' 'rec.sport.baseball' 'soc.religion.christian'\n", " 'talk.politics.mideast' 'talk.politics.misc' 'sci.crypt']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
contenttargettarget_names
0From: lerxst@wam.umd.edu (where's my thing)\\nS...7rec.autos
1From: guykuo@carson.u.washington.edu (Guy Kuo)...4comp.sys.mac.hardware
2From: twillis@ec.ecn.purdue.edu (Thomas E Will...4comp.sys.mac.hardware
3From: jgreen@amber (Joe Green)\\nSubject: Re: W...1comp.graphics
4From: jcm@head-cfa.harvard.edu (Jonathan McDow...14sci.space
\n", "
" ], "text/plain": [ " content target \\\n", "0 From: lerxst@wam.umd.edu (where's my thing)\\nS... 7 \n", "1 From: guykuo@carson.u.washington.edu (Guy Kuo)... 4 \n", "2 From: twillis@ec.ecn.purdue.edu (Thomas E Will... 4 \n", "3 From: jgreen@amber (Joe Green)\\nSubject: Re: W... 1 \n", "4 From: jcm@head-cfa.harvard.edu (Jonathan McDow... 14 \n", "\n", " target_names \n", "0 rec.autos \n", "1 comp.sys.mac.hardware \n", "2 comp.sys.mac.hardware \n", "3 comp.graphics \n", "4 sci.space " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "import pandas as pd\n", "df = pd.read_json('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')\n", "print(df.target_names.unique())\n", "df.head()\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "From: (Guy Kuo) Subject: SI Clock Poll - Final Call Summary: Final call for SI clock reports Keywords: SI,acceleration,clock,upgrade Article-I.D.: shelley.1qvfo9INNc3s Organization: University of Washington Lines: 11 NNTP-Posting-Host: carson.u.washington.edu A fair number of brave souls who upgraded their SI clock oscillator have shared their experiences for this poll. Please send a brief message detailing your experiences with the procedure. Top speed attained, CPU rated speed, add on cards and adapters, heat sinks, hour of usage per day, floppy disk functionality with 800 and 1.4 m floppies are especially requested. I will be summarizing in the next two days, so please add to the network knowledge base if you have done the clock upgrade and havent answered this poll. Thanks. Guy Kuo \n" ] } ], "source": [ "import re\n", "text_corpus = df.content.values.tolist()\n", "text_corpus = [re.sub('\\S*@\\S*\\s?', '', doc) for doc in text_corpus] #removing email addresses\n", "text_corpus = [re.sub('\\s+', ' ', doc) for doc in text_corpus] #removing newline characters\n", "text_corpus = [re.sub(\"\\'\", \"\", doc) for doc in text_corpus] #removing single quote characters\n", " \n", "print(text_corpus[1])" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['from', 'guy', 'kuo', 'subject', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'for', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'of', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'edu', 'fair', 'number', 'of', 'brave', 'souls', 'who', 'upgraded', 'their', 'si', 'clock', 'oscillator', 'have', 'shared', 'their', 'experiences', 'for', 'this', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'your', 'experiences', 'with', 'the', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'on', 'cards', 'and', 'adapters', 'heat', 'sinks', 'hour', 'of', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'with', 'and', 'floppies', 'are', 'especially', 'requested', 'will', 'be', 'summarizing', 'in', 'the', 'next', 'two', 'days', 'so', 'please', 'add', 'to', 'the', 'network', 'knowledge', 'base', 'if', 'you', 'have', 'done', 'the', 'clock', 'upgrade', 'and', 'havent', 'answered', 'this', 'poll', 'thanks', 'guy', 'kuo']\n" ] } ], "source": [ "import gensim\n", "import warnings\n", "warnings.simplefilter(\"ignore\", DeprecationWarning)\n", " \n", "def doc_to_words(sentences):\n", " for sentence in 
sentences:\n", " yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n", "\n", "words = list(doc_to_words(text_corpus)) \n", "print(words[1])" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['guy', 'kuo', 'si', 'clock', 'poll', 'final', 'call', 'summary', 'final', 'call', 'si', 'clock', 'reports', 'keywords', 'si', 'acceleration', 'clock', 'upgrade', 'article', 'shelley', 'qvfo', 'innc', 'organization', 'university', 'washington', 'lines', 'nntp', 'posting', 'host', 'carson', 'washington', 'fair', 'number', 'brave', 'souls', 'upgraded', 'si', 'clock', 'oscillator', 'shared', 'experiences', 'poll', 'please', 'send', 'brief', 'message', 'detailing', 'experiences', 'procedure', 'top', 'speed', 'attained', 'cpu', 'rated', 'speed', 'add', 'cards', 'adapters', 'heat', 'sinks', 'hour', 'usage', 'per', 'day', 'floppy', 'disk', 'functionality', 'floppies', 'especially', 'requested', 'summarizing', 'next', 'two', 'days', 'please', 'add', 'network', 'knowledge', 'base', 'done', 'clock', 'upgrade', 'havent', 'answered', 'poll', 'thanks', 'guy', 'kuo']\n" ] } ], "source": [ "from nltk.corpus import stopwords\n", "stop_words = stopwords.words('english')\n", "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n", " \n", "def remove_stopwords(text):\n", " return [[word for word in gensim.utils.simple_preprocess(str(doc)) if word not in stop_words] for doc in text_corpus]\n", " \n", "words = remove_stopwords(words)\n", " \n", "print(words[1])" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "0 documents lemmatised\n", "500 documents lemmatised\n", "1000 documents lemmatised\n", "1500 documents lemmatised\n", "2000 documents lemmatised\n", "2500 documents lemmatised\n", "3000 documents lemmatised\n", "3500 documents lemmatised\n", "4000 documents lemmatised\n", "4500 documents lemmatised\n", "5000 documents lemmatised\n", "5500 documents lemmatised\n", "6000 documents lemmatised\n", "6500 documents lemmatised\n", "7000 documents lemmatised\n", "7500 documents lemmatised\n", "8000 documents lemmatised\n", "8500 documents lemmatised\n", "9000 documents lemmatised\n", "9500 documents lemmatised\n", "10000 documents lemmatised\n", "10500 documents lemmatised\n", "11000 documents lemmatised\n" ] } ], "source": [ "import spacy\n", "nlp = spacy.load('en', disable=['parser', 'ner'] )\n", "# nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])\n", " \n", "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n", " texts_out = []\n", " for idx, sent in enumerate(texts):\n", " if (idx) % 500 == 0:\n", " print(str(idx) + ' documents lemmatised')\n", " doc = nlp(\" \".join(sent)) \n", " texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n", " return texts_out\n", " \n", "data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "\n", "# Create Dictionary\n", "import gensim.corpora as corpora\n", "id2word = corpora.Dictionary(data_lemmatized)\n", " \n", "# Create Corpus\n", "corpus = [id2word.doc2bow(text) for text in data_lemmatized]\n", " \n", "# Build LDA model\n", "lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\n", " id2word=id2word,\n", " num_topics=20, \n", " per_word_topics=True)" ] }, { "cell_type": "code", "execution_count": 
26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "[(0,\n", " '0.013*\"drive\" + 0.012*\"car\" + 0.012*\"not\" + 0.009*\"line\" + '\n", " '0.008*\"organization\" + 0.007*\"write\" + 0.007*\"article\" + 0.007*\"do\" + '\n", " '0.006*\"be\" + 0.006*\"light\"'),\n", " (1,\n", " '0.695*\"ax\" + 0.048*\"max\" + 0.009*\"_\" + 0.004*\"rlk\" + 0.003*\"bhj\" + '\n", " '0.003*\"ey\" + 0.003*\"giz\" + 0.002*\"qax\" + 0.002*\"tm\" + 0.002*\"chz\"'),\n", " (2,\n", " '0.027*\"key\" + 0.015*\"encryption\" + 0.014*\"clipper\" + 0.014*\"chip\" + '\n", " '0.009*\"government\" + 0.008*\"security\" + 0.007*\"escrow\" + 0.007*\"system\" + '\n", " '0.007*\"would\" + 0.007*\"public\"'),\n", " (3,\n", " '0.008*\"state\" + 0.008*\"not\" + 0.006*\"law\" + 0.006*\"write\" + '\n", " '0.006*\"armenian\" + 0.005*\"government\" + 0.005*\"line\" + 0.005*\"would\" + '\n", " '0.004*\"exist\" + 0.004*\"question\"'),\n", " (4,\n", " '0.014*\"not\" + 0.010*\"do\" + 0.008*\"would\" + 0.008*\"line\" + 0.008*\"write\" + '\n", " '0.007*\"year\" + 0.007*\"be\" + 0.007*\"get\" + 0.007*\"organization\" + '\n", " '0.006*\"time\"'),\n", " (5,\n", " '0.017*\"key\" + 0.015*\"bit\" + 0.012*\"line\" + 0.012*\"_\" + 0.009*\"organization\" '\n", " '+ 0.008*\"use\" + 0.008*\"number\" + 0.008*\"serial\" + 0.008*\"window\" + '\n", " '0.008*\"c\"'),\n", " (6,\n", " '0.030*\"space\" + 0.012*\"nasa\" + 0.011*\"orbit\" + 0.010*\"mission\" + '\n", " '0.009*\"mar\" + 0.007*\"earth\" + 0.007*\"satellite\" + 0.006*\"spacecraft\" + '\n", " '0.006*\"shuttle\" + 0.006*\"probe\"'),\n", " (7,\n", " '0.013*\"game\" + 0.012*\"hockey\" + 0.010*\"cub\" + 0.009*\"league\" + '\n", " '0.008*\"division\" + 0.008*\"db\" + 0.007*\"lose\" + 0.006*\"hawk\" + 0.006*\"line\" '\n", " '+ 0.006*\"min\"'),\n", " (8,\n", " '0.015*\"not\" + 0.010*\"say\" + 0.009*\"do\" + 0.009*\"people\" + 0.008*\"would\" + '\n", " '0.007*\"be\" + 0.007*\"write\" + 0.007*\"think\" + 0.007*\"god\" + 0.007*\"know\"'),\n", " (9,\n", " '0.013*\"greek\" + 0.010*\"article\" + 0.009*\"write\" + 0.009*\"organization\" + '\n", " '0.009*\"get\" + 0.009*\"line\" + 0.008*\"not\" + 0.007*\"car\" + 0.007*\"greece\" + '\n", " '0.007*\"dealer\"'),\n", " (10,\n", " '0.014*\"wire\" + 0.011*\"ground\" + 0.008*\"ax\" + 0.008*\"line\" + '\n", " '0.008*\"organization\" + 0.007*\"outlet\" + 0.006*\"post\" + 0.006*\"neutral\" + '\n", " '0.006*\"wiring\" + 0.006*\"write\"'),\n", " (11,\n", " '0.029*\"bike\" + 0.014*\"line\" + 0.013*\"dod\" + 0.012*\"organization\" + '\n", " '0.012*\"motorcycle\" + 0.012*\"ride\" + 0.011*\"rider\" + 0.008*\"post\" + '\n", " '0.007*\"nntp\" + 0.007*\"host\"'),\n", " (12,\n", " '0.021*\"not\" + 0.012*\"do\" + 0.010*\"write\" + 0.008*\"be\" + 0.008*\"line\" + '\n", " '0.007*\"get\" + 0.007*\"would\" + 0.007*\"drive\" + 0.007*\"scsi\" + '\n", " '0.007*\"article\"'),\n", " (13,\n", " '0.010*\"write\" + 0.008*\"line\" + 0.008*\"not\" + 0.008*\"article\" + '\n", " '0.008*\"organization\" + 0.006*\"would\" + 0.006*\"israel\" + 0.006*\"do\" + '\n", " '0.006*\"state\" + 0.006*\"right\"'),\n", " (14,\n", " '0.015*\"gun\" + 0.012*\"game\" + 0.011*\"team\" + 0.008*\"not\" + 0.008*\"play\" + '\n", " '0.007*\"get\" + 0.007*\"line\" + 0.007*\"organization\" + 0.007*\"year\" + '\n", " '0.006*\"go\"'),\n", " (15,\n", " '0.007*\"mail\" + 0.007*\"_\" + 0.006*\"file\" + 0.006*\"cx\" + 0.006*\"ripem\" + '\n", " '0.006*\"line\" + 0.006*\"list\" + 0.006*\"information\" + 0.005*\"post\" + '\n", " '0.005*\"available\"'),\n", " (16,\n", " '0.014*\"line\" + 0.014*\"drive\" + 
0.012*\"organization\" + 0.010*\"card\" + '\n", " '0.009*\"university\" + 0.008*\"disk\" + 0.008*\"driver\" + 0.008*\"post\" + '\n", " '0.008*\"video\" + 0.008*\"write\"'),\n", " (17,\n", " '0.011*\"new\" + 0.011*\"line\" + 0.010*\"organization\" + 0.009*\"gm\" + '\n", " '0.008*\"university\" + 0.008*\"not\" + 0.007*\"car\" + 0.006*\"write\" + '\n", " '0.006*\"good\" + 0.005*\"think\"'),\n", " (18,\n", " '0.020*\"line\" + 0.016*\"organization\" + 0.011*\"post\" + 0.011*\"file\" + '\n", " '0.010*\"host\" + 0.009*\"nntp\" + 0.008*\"program\" + 0.008*\"window\" + '\n", " '0.007*\"university\" + 0.007*\"get\"'),\n", " (19,\n", " '0.010*\"not\" + 0.007*\"do\" + 0.007*\"get\" + 0.006*\"window\" + '\n", " '0.005*\"organization\" + 0.005*\"be\" + 0.005*\"line\" + 0.005*\"space\" + '\n", " '0.005*\"would\" + 0.005*\"use\"')]\n" ] } ], "source": [ "from pprint import pprint\n", "pprint(lda_model.print_topics())\n", "doc_lda = lda_model[corpus]" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "ename": "KeyboardInterrupt", "evalue": "", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0menable_notebook\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 5\u001b[0;31m \u001b[0mvis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpyLDAvis\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgensim\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mprepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlda_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mid2word\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 6\u001b[0m \u001b[0mvis\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pyLDAvis/gensim.py\u001b[0m in \u001b[0;36mprepare\u001b[0;34m(topic_model, corpus, dictionary, doc_topic_dist, **kwargs)\u001b[0m\n\u001b[1;32m 117\u001b[0m \"\"\"\n\u001b[1;32m 118\u001b[0m \u001b[0mopts\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mfp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmerge\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0m_extract_data\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_model\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcorpus\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdictionary\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdoc_topic_dist\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 119\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mvis_prepare\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0mopts\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py\u001b[0m in \u001b[0;36mprepare\u001b[0;34m(topic_term_dists, doc_topic_dists, doc_lengths, vocab, term_frequency, R, lambda_step, mds, n_jobs, plot_opts, sort_topics)\u001b[0m\n\u001b[1;32m 396\u001b[0m 
\u001b[0mterm_frequency\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mterm_topic_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 397\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 398\u001b[0;31m \u001b[0mtopic_info\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_topic_info\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_term_dists\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtopic_proportion\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterm_frequency\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterm_topic_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mlambda_step\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mR\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mn_jobs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 399\u001b[0m \u001b[0mtoken_table\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_token_table\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_info\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterm_topic_freq\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvocab\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mterm_frequency\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 400\u001b[0m \u001b[0mtopic_coordinates\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_topic_coordinates\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmds\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtopic_term_dists\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtopic_proportion\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py\u001b[0m in \u001b[0;36m_topic_info\u001b[0;34m(topic_term_dists, topic_proportion, term_frequency, term_topic_freq, vocab, lambda_step, R, n_jobs)\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 254\u001b[0m top_terms = pd.concat(Parallel(n_jobs=n_jobs)(delayed(_find_relevance_chunks)(log_ttd, log_lift, R, ls) \\\n\u001b[0;32m--> 255\u001b[0;31m for ls in _job_chunks(lambda_seq, n_jobs)))\n\u001b[0m\u001b[1;32m 256\u001b[0m \u001b[0mtopic_dfs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmap\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_top_term_df\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0menumerate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtop_terms\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mT\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miterrows\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 257\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconcat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdefault_term_info\u001b[0m\u001b[0;34m]\u001b[0m \u001b[0;34m+\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtopic_dfs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/anaconda3/lib/python3.7/site-packages/joblib/parallel.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, iterable)\u001b[0m\n\u001b[1;32m 932\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 933\u001b[0m \u001b[0;32mwith\u001b[0m 
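{ "cell_type": "markdown", "metadata": {}, "source": [ "Before visualising, it is worth scoring the model. This evaluation cell is an addition to the tutorial: `log_perplexity` and gensim's `CoherenceModel` (here with the `c_v` measure) are standard ways to compare LDA runs, e.g. across different values of `num_topics`." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Added evaluation step (not in the original tutorial).\n", "from gensim.models import CoherenceModel\n", "\n", "# Per-word likelihood bound; less negative is better\n", "print('Perplexity:', lda_model.log_perplexity(corpus))\n", "\n", "# Higher c_v coherence is better; typical values fall roughly between 0.3 and 0.7\n", "coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized, dictionary=id2word, coherence='c_v')\n", "print('Coherence (c_v):', coherence_model.get_coherence())" ] },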
{ "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "# Visualize the topics; prepare() can take several minutes on ~11,000 documents\n", "import pyLDAvis\n", "import pyLDAvis.gensim\n", "pyLDAvis.enable_notebook()\n", "vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)\n", "vis\n" ] },
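{ "cell_type": "markdown", "metadata": {}, "source": [ "The interactive view can also be written to a standalone HTML file with `pyLDAvis.save_html` (an added note; the filename below is illustrative)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional: persist the visualisation so it can be shared outside the notebook\n", "pyLDAvis.save_html(vis, 'lda_topics.html')" ] },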
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def format_topics_sentences(ldamodel=lda_model, corpus=corpus, texts=text_corpus):\n", "    # Top keywords for every topic in the model\n", "    top10array = []\n", "\n", "    for row in range(ldamodel.num_topics):\n", "        wp = ldamodel.show_topic(row)\n", "        topic_keywords = \", \".join([word for word, prop in wp])\n", "        top10array.append((row+1, topic_keywords))\n", "\n", "    top10dict = dict(top10array)\n", "\n", "    # Dominant topic per document: sort each document's topic mixture by weight\n", "    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) for topic in ldamodel[corpus]])[0])\n", "    sent_topics_df.columns = [\"Data\"]\n", "    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)\n", "    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1],4))\n", "    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])\n", "\n", "    # Add the original text to the end of the output\n", "    contents = pd.Series(texts)\n", "    sent_topics_df = pd.concat([sent_topics_df, contents.rename(\"Text\")], axis=1)\n", "    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]\n", "    return(sent_topics_df)\n", "\n", "df_topic_sents_keywords = format_topics_sentences()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Show the single most representative document for each topic\n", "sent_topics_sorteddf_mallet = pd.DataFrame()\n", "\n", "sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')\n", "\n", "for i, grp in sent_topics_outdf_grpd:\n", "    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet,\n", "                                             grp.sort_values(['Perc_Contribution'], ascending=False).head(1)],\n", "                                            axis=0)\n", "\n", "# Reset index\n", "sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)\n", "\n", "# Format\n", "sent_topics_sorteddf_mallet.columns = ['Topic_Num', \"Topic_Perc_Contrib\", \"Keywords\", \"Text\"]\n", "\n", "# Show\n", "sent_topics_sorteddf_mallet.head()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 2 }