{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# LDA Clean\n",
    "[Tutorial Here](http://dataskunkworks.com/2018/06/06/extracting-topics-from-11000-newsgroups-posts-with-python-gensim-and-lda/)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### OVERVIEW\n",
    "    1. GET the data\n",
    "    2. CLEAN the data with regex\n",
    "    3. TOKENIZE the data\n",
    "    4. REMOVE stopwords from the data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "def get_data(url):\n",
    "    df = pd.read_json(url)\n",
    "    return df\n",
    "\n",
    "def do_some_eda(df):\n",
    "    print(df.target_names.unique())\n",
    "    df.head()\n",
    "\n",
    "import re\n",
    "def clean_data(df):\n",
    "    \n",
    "    text_corpus = df.content.values.tolist()\n",
    "    text_corpus = [re.sub('(<|</)([A-Z])\\w+>', '', doc) for doc in text_corpus] #removing things between <>\n",
    "    text_corpus = [re.sub('\\s+', ' ', doc) for doc in text_corpus] #removing newline \n",
    "    text_corpus = [re.sub(\"\\'\", \"\", doc) for doc in text_corpus] #removing single quotes\n",
    "    return text_corpus\n",
    "\n",
    "import gensim\n",
    "import warnings\n",
    "warnings.simplefilter(\"ignore\", DeprecationWarning)\n",
    " \n",
    "def doc_to_words(sentences):\n",
    "    for sentence in sentences:\n",
    "        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))\n",
    "\n",
    "        \n",
    "from nltk.corpus import stopwords\n",
    "stop_words = stopwords.words('english')\n",
    "stop_words.extend(['from', 'subject', 're', 'edu', 'use'])\n",
    " \n",
    "def remove_stopwords(text):\n",
    "    return [[word for word in gensim.utils.simple_preprocess(str(doc)) \n",
    "             if word not in stop_words] for doc in text_corpus]\n",
    " \n",
    "    \n",
    "import spacy\n",
    "nlp = spacy.load('en', disable=['parser', 'ner'] )\n",
    " \n",
    "def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):\n",
    "    print(len(texts))\n",
    "    texts_out = []\n",
    "    for idx, sent in enumerate(texts):\n",
    "        if (idx) % 500 == 0:\n",
    "            print(str(idx) + ' documents lemmatised')\n",
    "        doc = nlp(\" \".join(sent)) \n",
    "        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])\n",
    "    return texts_out\n",
    " \n",
    "# Create Dictionary\n",
    "import gensim.corpora as corpora\n",
    "def get_model(data_lemmatized):\n",
    "    id2word = corpora.Dictionary(data_lemmatized)\n",
    "\n",
    "    # Create Corpus\n",
    "    corpus = [id2word.doc2bow(text) for text in data_lemmatized]\n",
    "\n",
    "    # Build LDA model\n",
    "    lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,\n",
    "                                               id2word=id2word,\n",
    "                                               num_topics=20, \n",
    "                                               per_word_topics=True)\n",
    "    return corpus, lda_model\n",
    "\n",
    "\n",
    "def format_topics_sentences(ldamodel, corpus, texts):\n",
    "    # Array of top 10 topics\n",
    "    top10array = []\n",
    " \n",
    "    for row in range(ldamodel.num_topics):\n",
    "        wp = ldamodel.show_topic(row)\n",
    "        topic_keywords = \", \".join([word for word, prop in wp])\n",
    "        top10array.append((row+1, topic_keywords))\n",
    " \n",
    "    top10dict = dict(top10array)\n",
    " \n",
    "    sent_topics_df = pd.DataFrame(pd.DataFrame([sorted(topic[0], key=lambda x: (x[1]), reverse=True) \n",
    "                                                for topic in ldamodel[corpus]])[0])\n",
    "    sent_topics_df.columns=[\"Data\"]\n",
    "    sent_topics_df['Dominant_Topic'] = sent_topics_df.Data.apply(lambda x: x[0]+1)\n",
    "    sent_topics_df['Perc_Contribution'] = sent_topics_df.Data.apply(lambda x: round(x[1],4))\n",
    "    sent_topics_df['Topic_Keywords'] = sent_topics_df.Dominant_Topic.apply(lambda x: top10dict[x])\n",
    " \n",
    "    # Add original text to the end of the output\n",
    "    contents = pd.Series(texts)\n",
    "    sent_topics_df = pd.concat([sent_topics_df, contents.rename(\"Text\")], axis=1)\n",
    "    sent_topics_df = sent_topics_df[['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords', 'Text']]\n",
    "    return(sent_topics_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>content</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>&lt;DOC&gt;\\n&lt;DOCNO&gt;Mrs. JONES of Ohio. (PERSONAL EX...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>&lt;DOC&gt;\\n&lt;DOCNO&gt;Ms. ROS-LEHTINEN. (TOM LANTOS AN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>&lt;DOC&gt;\\n&lt;DOCNO&gt;Ms. WATERS. (PROVIDING FOR CONSI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>&lt;DOC&gt;\\n&lt;DOCNO&gt;Mrs. DAVIS of California. (PROVI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>&lt;DOC&gt;\\n&lt;DOCNO&gt;Mrs. NAPOLITANO. (PASSENGER RAIL...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                             content\n",
       "0  <DOC>\\n<DOCNO>Mrs. JONES of Ohio. (PERSONAL EX...\n",
       "1  <DOC>\\n<DOCNO>Ms. ROS-LEHTINEN. (TOM LANTOS AN...\n",
       "2  <DOC>\\n<DOCNO>Ms. WATERS. (PROVIDING FOR CONSI...\n",
       "3  <DOC>\\n<DOCNO>Mrs. DAVIS of California. (PROVI...\n",
       "4  <DOC>\\n<DOCNO>Mrs. NAPOLITANO. (PASSENGER RAIL..."
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# def do_the_thing():\n",
    "# df = get_data('https://raw.githubusercontent.com/selva86/datasets/master/newsgroups.json')\n",
    "\n",
    "import os\n",
    "def get_data_from_files(path):\n",
    "    directory = os.listdir(path)\n",
    "    results = []\n",
    "    for file in directory:\n",
    "        f=open(path+file,  encoding = \"ISO-8859-1\")\n",
    "        results.append(f.read())\n",
    "        f.close()\n",
    "    return results\n",
    "\n",
    "\n",
    "# DATA SET 2\n",
    "data_fd = get_data_from_files('110/110-f-d/')\n",
    "data_fr = get_data_from_files('110/110-f-r/')\n",
    "data_md = get_data_from_files('110/110-m-d/')\n",
    "data_mr = get_data_from_files('110/110-m-r/')\n",
    "\n",
    "female_data = data_fd + data_fr \n",
    "male_data = data_md + data_mr\n",
    "dem_data = data_md + data_fd\n",
    "rep_data = data_mr + data_fr\n",
    "\n",
    "all_data = female_data + male_data\n",
    "\n",
    "# DATA SET 2 -- SMALL\n",
    "female_data_sm = data_fd[:10] + data_fr[:10] \n",
    "male_data_sm = data_md[:10] + data_mr[:10]\n",
    "dem_data = data_md[:10] + data_fd[:10]\n",
    "rep_data = data_mr[:10] + data_fr[:10]\n",
    "\n",
    "all_data = female_data_sm + male_data_sm\n",
    "df = pd.DataFrame({'content': all_data})\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "40\n",
      "0 documents lemmatised\n",
      "[(0,\n",
      "  '0.008*\"bill\" + 0.008*\"house\" + 0.007*\"say\" + 0.007*\"speaker\" + 0.006*\"year\" '\n",
      "  '+ 0.006*\"support\" + 0.006*\"time\" + 0.006*\"would\" + 0.006*\"work\" + '\n",
      "  '0.005*\"go\"'),\n",
      " (1,\n",
      "  '0.009*\"bill\" + 0.009*\"house\" + 0.008*\"go\" + 0.007*\"would\" + 0.007*\"year\" + '\n",
      "  '0.006*\"time\" + 0.006*\"not\" + 0.006*\"american\" + 0.006*\"speaker\" + '\n",
      "  '0.006*\"representative\"'),\n",
      " (2,\n",
      "  '0.008*\"house\" + 0.008*\"speaker\" + 0.007*\"would\" + 0.007*\"time\" + '\n",
      "  '0.007*\"support\" + 0.006*\"year\" + 0.006*\"bill\" + 0.006*\"go\" + 0.006*\"make\" + '\n",
      "  '0.006*\"act\"'),\n",
      " (3,\n",
      "  '0.009*\"go\" + 0.009*\"representative\" + 0.008*\"house\" + 0.008*\"bill\" + '\n",
      "  '0.008*\"people\" + 0.007*\"american\" + 0.007*\"say\" + 0.007*\"speaker\" + '\n",
      "  '0.007*\"make\" + 0.006*\"year\"'),\n",
      " (4,\n",
      "  '0.009*\"speaker\" + 0.008*\"house\" + 0.008*\"go\" + 0.007*\"make\" + 0.006*\"time\" '\n",
      "  '+ 0.006*\"bill\" + 0.006*\"people\" + 0.006*\"american\" + 0.006*\"say\" + '\n",
      "  '0.006*\"not\"'),\n",
      " (5,\n",
      "  '0.010*\"house\" + 0.008*\"go\" + 0.007*\"work\" + 0.007*\"american\" + '\n",
      "  '0.007*\"speaker\" + 0.007*\"not\" + 0.007*\"state\" + 0.006*\"year\" + '\n",
      "  '0.006*\"representative\" + 0.006*\"would\"'),\n",
      " (6,\n",
      "  '0.009*\"go\" + 0.008*\"american\" + 0.007*\"bill\" + 0.007*\"house\" + 0.007*\"say\" '\n",
      "  '+ 0.006*\"speaker\" + 0.006*\"time\" + 0.006*\"make\" + 0.006*\"would\" + '\n",
      "  '0.005*\"not\"'),\n",
      " (7,\n",
      "  '0.007*\"house\" + 0.007*\"time\" + 0.007*\"go\" + 0.007*\"speaker\" + 0.007*\"would\" '\n",
      "  '+ 0.007*\"make\" + 0.006*\"year\" + 0.006*\"representative\" + 0.006*\"people\" + '\n",
      "  '0.006*\"bill\"'),\n",
      " (8,\n",
      "  '0.008*\"house\" + 0.007*\"speaker\" + 0.007*\"people\" + 0.007*\"bill\" + '\n",
      "  '0.007*\"representative\" + 0.006*\"american\" + 0.006*\"year\" + 0.005*\"act\" + '\n",
      "  '0.005*\"say\" + 0.005*\"work\"'),\n",
      " (9,\n",
      "  '0.011*\"house\" + 0.010*\"go\" + 0.008*\"speaker\" + 0.008*\"bill\" + '\n",
      "  '0.008*\"american\" + 0.008*\"work\" + 0.007*\"make\" + 0.007*\"people\" + '\n",
      "  '0.006*\"want\" + 0.006*\"say\"'),\n",
      " (10,\n",
      "  '0.010*\"go\" + 0.009*\"house\" + 0.008*\"speaker\" + 0.008*\"bill\" + 0.008*\"say\" + '\n",
      "  '0.007*\"not\" + 0.007*\"american\" + 0.007*\"think\" + 0.006*\"year\" + '\n",
      "  '0.006*\"people\"'),\n",
      " (11,\n",
      "  '0.008*\"house\" + 0.007*\"say\" + 0.007*\"bill\" + 0.007*\"would\" + 0.007*\"go\" + '\n",
      "  '0.006*\"speaker\" + 0.006*\"people\" + 0.006*\"state\" + 0.006*\"american\" + '\n",
      "  '0.006*\"time\"'),\n",
      " (12,\n",
      "  '0.009*\"house\" + 0.007*\"bill\" + 0.006*\"act\" + 0.006*\"say\" + 0.006*\"go\" + '\n",
      "  '0.006*\"state\" + 0.006*\"american\" + 0.006*\"representative\" + 0.006*\"year\" + '\n",
      "  '0.006*\"people\"'),\n",
      " (13,\n",
      "  '0.008*\"house\" + 0.007*\"go\" + 0.007*\"speaker\" + 0.007*\"american\" + '\n",
      "  '0.007*\"would\" + 0.006*\"representative\" + 0.006*\"bill\" + 0.006*\"people\" + '\n",
      "  '0.006*\"year\" + 0.006*\"state\"'),\n",
      " (14,\n",
      "  '0.012*\"bill\" + 0.008*\"house\" + 0.008*\"speaker\" + 0.007*\"go\" + 0.007*\"would\" '\n",
      "  '+ 0.006*\"representative\" + 0.006*\"year\" + 0.006*\"time\" + 0.005*\"people\" + '\n",
      "  '0.005*\"american\"'),\n",
      " (15,\n",
      "  '0.009*\"house\" + 0.009*\"speaker\" + 0.009*\"say\" + 0.008*\"go\" + 0.007*\"time\" + '\n",
      "  '0.007*\"american\" + 0.006*\"people\" + 0.006*\"not\" + 0.006*\"state\" + '\n",
      "  '0.006*\"want\"'),\n",
      " (16,\n",
      "  '0.009*\"bill\" + 0.008*\"house\" + 0.007*\"go\" + 0.007*\"would\" + 0.006*\"people\" '\n",
      "  '+ 0.006*\"speaker\" + 0.005*\"time\" + 0.005*\"act\" + 0.005*\"representative\" + '\n",
      "  '0.005*\"american\"'),\n",
      " (17,\n",
      "  '0.008*\"house\" + 0.008*\"speaker\" + 0.007*\"bill\" + 0.007*\"would\" + '\n",
      "  '0.006*\"time\" + 0.006*\"american\" + 0.006*\"go\" + 0.006*\"act\" + 0.005*\"work\" + '\n",
      "  '0.005*\"year\"'),\n",
      " (18,\n",
      "  '0.009*\"house\" + 0.008*\"go\" + 0.008*\"speaker\" + 0.006*\"representative\" + '\n",
      "  '0.006*\"support\" + 0.006*\"people\" + 0.006*\"say\" + 0.006*\"year\" + '\n",
      "  '0.006*\"work\" + 0.006*\"bill\"'),\n",
      " (19,\n",
      "  '0.009*\"bill\" + 0.007*\"representative\" + 0.007*\"house\" + 0.007*\"congress\" + '\n",
      "  '0.007*\"american\" + 0.006*\"state\" + 0.006*\"go\" + 0.006*\"support\" + '\n",
      "  '0.006*\"year\" + 0.006*\"people\"')]\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Dominant_Topic</th>\n",
       "      <th>Perc_Contribution</th>\n",
       "      <th>Topic_Keywords</th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0.5471</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Mrs. JONES of Ohio. (PERSONAL EXPLANATION -- ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0.2777</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Ms. ROS-LEHTINEN. (TOM LANTOS AND HENRY J. HY...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>10</td>\n",
       "      <td>0.3945</td>\n",
       "      <td>house, go, speaker, bill, american, work, make...</td>\n",
       "      <td>Ms. WATERS. (PROVIDING FOR CONSIDERATION OF S...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>6</td>\n",
       "      <td>0.3046</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mrs. DAVIS of California. (PROVIDING FOR CONS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>13</td>\n",
       "      <td>0.6177</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Mrs. NAPOLITANO. (PASSENGER RAIL INVESTMENT A...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>1</td>\n",
       "      <td>0.2588</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Ms. SCHWARTZ. (WELCOMING MEMBERS OF PARLIAMEN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>0.6765</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Ms. BERKLEY. (RECOVERY REBATES AND ECONOMIC S...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>2</td>\n",
       "      <td>0.3375</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Ms. ESHOO. (CONFERENCE REPORT ON H.R. 4040, C...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>6</td>\n",
       "      <td>0.2586</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Ms. SCHAKOWSKY. (PROTECT AMERICA ACT OF 2007 ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>1</td>\n",
       "      <td>0.5075</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Ms. GIFFORDS. (EMPLOYEE VERIFICATION AMENDMEN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>1</td>\n",
       "      <td>0.4394</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Ms. PRYCE of Ohio. (SUPPORTING EFFORTS TO INC...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>6</td>\n",
       "      <td>0.7472</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mrs. SCHMIDT. (HONORING THE LIFE OF PATRICIA ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>6</td>\n",
       "      <td>0.2564</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mrs. BIGGERT. (RECOVERY REBATES AND ECONOMIC ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>2</td>\n",
       "      <td>0.4394</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Ms. GRANGER. (SHOSHONE-PAIUTE TRIBES OF THE D...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>11</td>\n",
       "      <td>0.5507</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mrs. EMERSON. (WILLIAM ``BILL CLAY POST OFFIC...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>11</td>\n",
       "      <td>0.5633</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mrs. BLACKBURN. (THE ENERGY PROBLEM IS ONE WE...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>13</td>\n",
       "      <td>0.2226</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Mrs. CAPITO. (WE CANNOT ALLOW OUR DOMESTIC EN...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>13</td>\n",
       "      <td>0.4348</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Ms. FOXX. (D&amp;D DISPLAYS INNOVATES IN NORTH WI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>1</td>\n",
       "      <td>0.4321</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Mrs. JO ANN DAVIS of Virginia. (COMMEMORATING...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>6</td>\n",
       "      <td>0.4760</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mrs. BONO. (HONORING JACK VALENTI -- (House o...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>17</td>\n",
       "      <td>0.5430</td>\n",
       "      <td>bill, house, go, would, people, speaker, time,...</td>\n",
       "      <td>Mr. BERRY. (RELATING TO THE HOUSE PROCEDURES ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>13</td>\n",
       "      <td>0.2873</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Mr. McNERNEY. (PROVIDING FOR CONSIDERATION OF...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>22</td>\n",
       "      <td>6</td>\n",
       "      <td>0.3873</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mr. ANDREWS. (CONDEMNING THE PERSECUTION OF L...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>23</td>\n",
       "      <td>17</td>\n",
       "      <td>0.4443</td>\n",
       "      <td>bill, house, go, would, people, speaker, time,...</td>\n",
       "      <td>Mr. POMEROY. (RELATING TO THE HOUSE PROCEDURE...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>24</td>\n",
       "      <td>11</td>\n",
       "      <td>0.2772</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mr. BLUMENAUER. (RECOVERY REBATES AND ECONOMI...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25</td>\n",
       "      <td>8</td>\n",
       "      <td>0.2865</td>\n",
       "      <td>house, time, go, speaker, would, make, year, r...</td>\n",
       "      <td>Mr. PATRICK J. MURPHY of Pennsylvania. (EMPLO...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>26</td>\n",
       "      <td>10</td>\n",
       "      <td>0.5393</td>\n",
       "      <td>house, go, speaker, bill, american, work, make...</td>\n",
       "      <td>Mr. MEEK of Florida. (30-SOMETHING WORKING GR...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>27</td>\n",
       "      <td>2</td>\n",
       "      <td>0.2703</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Mr. REYES. (NATIONAL ENERGY SECURITY INTELLIG...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>28</td>\n",
       "      <td>6</td>\n",
       "      <td>0.4751</td>\n",
       "      <td>house, go, work, american, speaker, not, state...</td>\n",
       "      <td>Mr. HARE. (RECOVERY REBATES AND ECONOMIC STIM...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>29</td>\n",
       "      <td>17</td>\n",
       "      <td>0.3969</td>\n",
       "      <td>bill, house, go, would, people, speaker, time,...</td>\n",
       "      <td>Mr. SIRES. (TOM LANTOS AND HENRY J. HYDE UNIT...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>30</td>\n",
       "      <td>11</td>\n",
       "      <td>0.6178</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mr. PITTS. (EXELON -- (House of Representativ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>31</td>\n",
       "      <td>2</td>\n",
       "      <td>0.5347</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Mr. MORAN of Kansas. (RECOGNIZING THE SPECIAL...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>32</td>\n",
       "      <td>2</td>\n",
       "      <td>0.2713</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Mr. SALI. (HIGH ENERGY PRICES -- (House of Re...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>33</td>\n",
       "      <td>11</td>\n",
       "      <td>0.3862</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mr. McCARTHY of California. (REPUBLICAN FRESH...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>34</td>\n",
       "      <td>2</td>\n",
       "      <td>0.7438</td>\n",
       "      <td>bill, house, go, would, year, time, not, ameri...</td>\n",
       "      <td>Mr. TIAHRT. (EMPLOYEE FREE CHOICE ACT -- (Hou...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>35</td>\n",
       "      <td>13</td>\n",
       "      <td>0.4265</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Mr. PLATTS. (FREEDOM OF INFORMATION ACT AMEND...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>36</td>\n",
       "      <td>17</td>\n",
       "      <td>0.3548</td>\n",
       "      <td>bill, house, go, would, people, speaker, time,...</td>\n",
       "      <td>Mr. SMITH of Nebraska. (EXPAND OUR NATIONS EX...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>37</td>\n",
       "      <td>13</td>\n",
       "      <td>0.4737</td>\n",
       "      <td>house, bill, act, say, go, state, american, re...</td>\n",
       "      <td>Mr. HALL of Texas. (PRODUCED WATER UTILIZATIO...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>38</td>\n",
       "      <td>1</td>\n",
       "      <td>0.9203</td>\n",
       "      <td>bill, house, say, speaker, year, support, time...</td>\n",
       "      <td>Mr. RENZI. (HAWAIIAN HOMEOWNERSHIP OPPORTUNIT...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>39</td>\n",
       "      <td>11</td>\n",
       "      <td>0.2845</td>\n",
       "      <td>go, house, speaker, bill, say, not, american, ...</td>\n",
       "      <td>Mr. FOSSELLA. (TO ELIMINATE THE EXEMPTION FRO...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Dominant_Topic  Perc_Contribution  \\\n",
       "0                1             0.5471   \n",
       "1                1             0.2777   \n",
       "2               10             0.3945   \n",
       "3                6             0.3046   \n",
       "4               13             0.6177   \n",
       "5                1             0.2588   \n",
       "6                1             0.6765   \n",
       "7                2             0.3375   \n",
       "8                6             0.2586   \n",
       "9                1             0.5075   \n",
       "10               1             0.4394   \n",
       "11               6             0.7472   \n",
       "12               6             0.2564   \n",
       "13               2             0.4394   \n",
       "14              11             0.5507   \n",
       "15              11             0.5633   \n",
       "16              13             0.2226   \n",
       "17              13             0.4348   \n",
       "18               1             0.4321   \n",
       "19               6             0.4760   \n",
       "20              17             0.5430   \n",
       "21              13             0.2873   \n",
       "22               6             0.3873   \n",
       "23              17             0.4443   \n",
       "24              11             0.2772   \n",
       "25               8             0.2865   \n",
       "26              10             0.5393   \n",
       "27               2             0.2703   \n",
       "28               6             0.4751   \n",
       "29              17             0.3969   \n",
       "30              11             0.6178   \n",
       "31               2             0.5347   \n",
       "32               2             0.2713   \n",
       "33              11             0.3862   \n",
       "34               2             0.7438   \n",
       "35              13             0.4265   \n",
       "36              17             0.3548   \n",
       "37              13             0.4737   \n",
       "38               1             0.9203   \n",
       "39              11             0.2845   \n",
       "\n",
       "                                       Topic_Keywords  \\\n",
       "0   bill, house, say, speaker, year, support, time...   \n",
       "1   bill, house, say, speaker, year, support, time...   \n",
       "2   house, go, speaker, bill, american, work, make...   \n",
       "3   house, go, work, american, speaker, not, state...   \n",
       "4   house, bill, act, say, go, state, american, re...   \n",
       "5   bill, house, say, speaker, year, support, time...   \n",
       "6   bill, house, say, speaker, year, support, time...   \n",
       "7   bill, house, go, would, year, time, not, ameri...   \n",
       "8   house, go, work, american, speaker, not, state...   \n",
       "9   bill, house, say, speaker, year, support, time...   \n",
       "10  bill, house, say, speaker, year, support, time...   \n",
       "11  house, go, work, american, speaker, not, state...   \n",
       "12  house, go, work, american, speaker, not, state...   \n",
       "13  bill, house, go, would, year, time, not, ameri...   \n",
       "14  go, house, speaker, bill, say, not, american, ...   \n",
       "15  go, house, speaker, bill, say, not, american, ...   \n",
       "16  house, bill, act, say, go, state, american, re...   \n",
       "17  house, bill, act, say, go, state, american, re...   \n",
       "18  bill, house, say, speaker, year, support, time...   \n",
       "19  house, go, work, american, speaker, not, state...   \n",
       "20  bill, house, go, would, people, speaker, time,...   \n",
       "21  house, bill, act, say, go, state, american, re...   \n",
       "22  house, go, work, american, speaker, not, state...   \n",
       "23  bill, house, go, would, people, speaker, time,...   \n",
       "24  go, house, speaker, bill, say, not, american, ...   \n",
       "25  house, time, go, speaker, would, make, year, r...   \n",
       "26  house, go, speaker, bill, american, work, make...   \n",
       "27  bill, house, go, would, year, time, not, ameri...   \n",
       "28  house, go, work, american, speaker, not, state...   \n",
       "29  bill, house, go, would, people, speaker, time,...   \n",
       "30  go, house, speaker, bill, say, not, american, ...   \n",
       "31  bill, house, go, would, year, time, not, ameri...   \n",
       "32  bill, house, go, would, year, time, not, ameri...   \n",
       "33  go, house, speaker, bill, say, not, american, ...   \n",
       "34  bill, house, go, would, year, time, not, ameri...   \n",
       "35  house, bill, act, say, go, state, american, re...   \n",
       "36  bill, house, go, would, people, speaker, time,...   \n",
       "37  house, bill, act, say, go, state, american, re...   \n",
       "38  bill, house, say, speaker, year, support, time...   \n",
       "39  go, house, speaker, bill, say, not, american, ...   \n",
       "\n",
       "                                                 Text  \n",
       "0    Mrs. JONES of Ohio. (PERSONAL EXPLANATION -- ...  \n",
       "1    Ms. ROS-LEHTINEN. (TOM LANTOS AND HENRY J. HY...  \n",
       "2    Ms. WATERS. (PROVIDING FOR CONSIDERATION OF S...  \n",
       "3    Mrs. DAVIS of California. (PROVIDING FOR CONS...  \n",
       "4    Mrs. NAPOLITANO. (PASSENGER RAIL INVESTMENT A...  \n",
       "5    Ms. SCHWARTZ. (WELCOMING MEMBERS OF PARLIAMEN...  \n",
       "6    Ms. BERKLEY. (RECOVERY REBATES AND ECONOMIC S...  \n",
       "7    Ms. ESHOO. (CONFERENCE REPORT ON H.R. 4040, C...  \n",
       "8    Ms. SCHAKOWSKY. (PROTECT AMERICA ACT OF 2007 ...  \n",
       "9    Ms. GIFFORDS. (EMPLOYEE VERIFICATION AMENDMEN...  \n",
       "10   Ms. PRYCE of Ohio. (SUPPORTING EFFORTS TO INC...  \n",
       "11   Mrs. SCHMIDT. (HONORING THE LIFE OF PATRICIA ...  \n",
       "12   Mrs. BIGGERT. (RECOVERY REBATES AND ECONOMIC ...  \n",
       "13   Ms. GRANGER. (SHOSHONE-PAIUTE TRIBES OF THE D...  \n",
       "14   Mrs. EMERSON. (WILLIAM ``BILL CLAY POST OFFIC...  \n",
       "15   Mrs. BLACKBURN. (THE ENERGY PROBLEM IS ONE WE...  \n",
       "16   Mrs. CAPITO. (WE CANNOT ALLOW OUR DOMESTIC EN...  \n",
       "17   Ms. FOXX. (D&D DISPLAYS INNOVATES IN NORTH WI...  \n",
       "18   Mrs. JO ANN DAVIS of Virginia. (COMMEMORATING...  \n",
       "19   Mrs. BONO. (HONORING JACK VALENTI -- (House o...  \n",
       "20   Mr. BERRY. (RELATING TO THE HOUSE PROCEDURES ...  \n",
       "21   Mr. McNERNEY. (PROVIDING FOR CONSIDERATION OF...  \n",
       "22   Mr. ANDREWS. (CONDEMNING THE PERSECUTION OF L...  \n",
       "23   Mr. POMEROY. (RELATING TO THE HOUSE PROCEDURE...  \n",
       "24   Mr. BLUMENAUER. (RECOVERY REBATES AND ECONOMI...  \n",
       "25   Mr. PATRICK J. MURPHY of Pennsylvania. (EMPLO...  \n",
       "26   Mr. MEEK of Florida. (30-SOMETHING WORKING GR...  \n",
       "27   Mr. REYES. (NATIONAL ENERGY SECURITY INTELLIG...  \n",
       "28   Mr. HARE. (RECOVERY REBATES AND ECONOMIC STIM...  \n",
       "29   Mr. SIRES. (TOM LANTOS AND HENRY J. HYDE UNIT...  \n",
       "30   Mr. PITTS. (EXELON -- (House of Representativ...  \n",
       "31   Mr. MORAN of Kansas. (RECOGNIZING THE SPECIAL...  \n",
       "32   Mr. SALI. (HIGH ENERGY PRICES -- (House of Re...  \n",
       "33   Mr. McCARTHY of California. (REPUBLICAN FRESH...  \n",
       "34   Mr. TIAHRT. (EMPLOYEE FREE CHOICE ACT -- (Hou...  \n",
       "35   Mr. PLATTS. (FREEDOM OF INFORMATION ACT AMEND...  \n",
       "36   Mr. SMITH of Nebraska. (EXPAND OUR NATIONS EX...  \n",
       "37   Mr. HALL of Texas. (PRODUCED WATER UTILIZATIO...  \n",
       "38   Mr. RENZI. (HAWAIIAN HOMEOWNERSHIP OPPORTUNIT...  \n",
       "39   Mr. FOSSELLA. (TO ELIMINATE THE EXEMPTION FRO...  "
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# do_some_eda(df)\n",
    "text_corpus = clean_data(df)\n",
    "words = list(doc_to_words(text_corpus)) \n",
    "\n",
    "words = remove_stopwords(words)\n",
    "# words\n",
    "data_lemmatized = lemmatization(words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])\n",
    "\n",
    "corpus, lda_model = get_model(data_lemmatized)\n",
    "\n",
    "from pprint import pprint\n",
    "pprint(lda_model.print_topics())\n",
    "doc_lda = lda_model[corpus]\n",
    "df_topic_sents_keywords = format_topics_sentences(lda_model, corpus, text_corpus)\n",
    "df_topic_sents_keywords\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Visualize the topics\n",
    "# import pyLDAvis\n",
    "# import pyLDAvis.gensim  \n",
    "# pyLDAvis.enable_notebook()\n",
    "# vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)\n",
    "# vis\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Group top 5 sentences under each topic\n",
    "sent_topics_sorteddf_mallet = pd.DataFrame()\n",
    " \n",
    "sent_topics_outdf_grpd = df_topic_sents_keywords.groupby('Dominant_Topic')\n",
    " \n",
    "for i, grp in sent_topics_outdf_grpd:\n",
    "    sent_topics_sorteddf_mallet = pd.concat([sent_topics_sorteddf_mallet, \n",
    "                                             grp.sort_values(['Perc_Contribution'], ascending=[0]).head(1)], \n",
    "                                            axis=0)\n",
    "\n",
    "# Reset Index    \n",
    "sent_topics_sorteddf_mallet.reset_index(drop=True, inplace=True)\n",
    " \n",
    "# Format\n",
    "sent_topics_sorteddf_mallet.columns = ['Topic_Num', \"Topic_Perc_Contrib\", \"Keywords\", \"Text\"]\n",
    " \n",
    "# Show\n",
    "sent_topics_sorteddf_mallet.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
