{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This is a tutorial on using sklearn for topic modeling. \n",
    "\n",
    "References:\n",
    "\n",
    "(1) http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html\n",
    "\n",
    "(2) https://medium.com/mlreview/topic-modeling-with-scikit-learn-e80d33668730\n",
    "https://nlpforhackers.io/topic-modeling/\n",
    "\n",
    "(3) https://nlpforhackers.io/topic-modeling/\n",
    "\n",
    "(4) http://derekgreene.com/slides/topic-modelling-with-scikitlearn.pdf"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# First read in a data file \"20newsgroups\" from sklearn's own data sets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['alt.atheism', 'comp.graphics', 'comp.os.ms-windows.misc', 'comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware', 'comp.windows.x', 'misc.forsale', 'rec.autos', 'rec.motorcycles', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']\n",
      "11314\n"
     ]
    }
   ],
   "source": [
    "from sklearn.datasets import fetch_20newsgroups\n",
    "\n",
    "dataset = fetch_20newsgroups(shuffle=True, random_state=1, remove=('headers', 'footers', 'quotes'))\n",
    "documents = dataset.data\n",
    "print(dataset.target_names)\n",
    "print(len(documents))\n",
    "#print(documents[0])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Then vectorize the text files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "\n",
    "no_features = 1000\n",
    "\n",
    "# NMF is able to use tf-idf\n",
    "#tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')\n",
    "#tfidf = tfidf_vectorizer.fit_transform(documents)\n",
    "#tfidf_feature_names = tfidf_vectorizer.get_feature_names()\n",
    "\n",
    "# LDA can only use raw term counts for LDA because it is a probabilistic graphical model\n",
    "tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=no_features, stop_words='english')\n",
    "tf = tf_vectorizer.fit_transform(documents)\n",
    "tf_feature_names = tf_vectorizer.get_feature_names()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Then call the LDA algorithm to fit a topic model, and transform all documents to their topic distrinbutions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import NMF, LatentDirichletAllocation\n",
    "\n",
    "no_topics = 20\n",
    "\n",
    "# Run NMF\n",
    "#nmf = NMF(n_components=no_topics, random_state=1, alpha=.1, l1_ratio=.5, init='nndsvd').fit(tfidf)\n",
    "\n",
    "# Run LDA\n",
    "lda = LatentDirichletAllocation(n_components=no_topics, max_iter=5, learning_method='online', learning_offset=50.,random_state=0)\n",
    "lda_z = lda.fit_transform(tf)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def display_topics(model, feature_names, no_top_words):\n",
    "    for topic_idx, topic in enumerate(model.components_):\n",
    "        print(\"Topic %d:\" % (topic_idx))\n",
    "        print(\" \".join([feature_names[i]\n",
    "                        for i in topic.argsort()[:-no_top_words - 1:-1]]))\n",
    "\n",
    "no_top_words = 10"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 0:\n",
      "people gun state control right guns crime states law police\n",
      "Topic 1:\n",
      "time question book years did like don space answer just\n",
      "Topic 2:\n",
      "mr line rules science stephanopoulos title current define int yes\n",
      "Topic 3:\n",
      "key chip keys clipper encryption number des algorithm use bit\n",
      "Topic 4:\n",
      "edu com cs vs w7 cx mail uk 17 send\n",
      "Topic 5:\n",
      "use does window problem way used point different case value\n",
      "Topic 6:\n",
      "windows thanks know help db does dos problem like using\n",
      "Topic 7:\n",
      "bike water effect road design media dod paper like turn\n",
      "Topic 8:\n",
      "don just like think know people good ve going say\n",
      "Topic 9:\n",
      "car new price good power used air sale offer ground\n",
      "Topic 10:\n",
      "file available program edu ftp information files use image version\n",
      "Topic 11:\n",
      "ax max b8f g9v a86 145 pl 1d9 0t 34u\n",
      "Topic 12:\n",
      "government law privacy security legal encryption court fbi technology information\n",
      "Topic 13:\n",
      "card bit memory output video color data mode monitor 16\n",
      "Topic 14:\n",
      "drive scsi disk mac hard apple drives controller software port\n",
      "Topic 15:\n",
      "god jesus people believe christian bible say does life church\n",
      "Topic 16:\n",
      "year game team games season play hockey players league player\n",
      "Topic 17:\n",
      "10 00 15 25 20 11 12 14 16 13\n",
      "Topic 18:\n",
      "armenian israel armenians war people jews turkish israeli said women\n",
      "Topic 19:\n",
      "president people new said health year university school day work\n"
     ]
    }
   ],
   "source": [
    "display_topics(lda, tf_feature_names, no_top_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(11314, 20)\n",
      "[0.00172414 0.00172414 0.00172414 0.00172414 0.00172414 0.00172414\n",
      " 0.00172414 0.46904086 0.14219449 0.00172414 0.00172414 0.00172414\n",
      " 0.00172414 0.00172414 0.00172414 0.00172414 0.00172414 0.00172414\n",
      " 0.3594543  0.00172414]\n",
      "Well i'm not sure about the story nad it did seem biased. What\n",
      "I disagree with is your statement that the U.S. Media is out to\n",
      "ruin Israels reputation. That is rediculous. The U.S. media is\n",
      "the most pro-israeli media in the world. Having lived in Europe\n",
      "I realize that incidences such as the one described in the\n",
      "letter have occured. The U.S. media as a whole seem to try to\n",
      "ignore them. The U.S. is subsidizing Israels existance and the\n",
      "Europeans are not (at least not to the same degree). So I think\n",
      "that might be a reason they report more clearly on the\n",
      "atrocities.\n",
      "\tWhat is a shame is that in Austria, daily reports of\n",
      "the inhuman acts commited by Israeli soldiers and the blessing\n",
      "received from the Government makes some of the Holocaust guilt\n",
      "go away. After all, look how the Jews are treating other races\n",
      "when they got power. It is unfortunate.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "#print(lda.components_[0][:5])\n",
    "print(lda_z.shape)\n",
    "print(lda_z[0])\n",
    "print(documents[0])\n",
    "#print(documents.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Topic 0:\n",
      "people time right did good said say make way government\n",
      "Topic 1:\n",
      "window problem using server application screen display motif manager running\n",
      "Topic 2:\n",
      "god jesus bible christ faith believe christian christians sin church\n",
      "Topic 3:\n",
      "game team year games season players play hockey win league\n",
      "Topic 4:\n",
      "new 00 sale 10 price offer shipping condition 20 15\n",
      "Topic 5:\n",
      "thanks mail advance hi looking info help information address appreciated\n",
      "Topic 6:\n",
      "windows file files dos program version ftp ms directory running\n",
      "Topic 7:\n",
      "edu soon cs university ftp internet article email pub david\n",
      "Topic 8:\n",
      "key chip clipper encryption keys escrow government public algorithm nsa\n",
      "Topic 9:\n",
      "drive scsi drives hard disk ide floppy controller cd mac\n",
      "Topic 10:\n",
      "just ll thought tell oh little fine work wanted mean\n",
      "Topic 11:\n",
      "does know anybody mean work say doesn help exist program\n",
      "Topic 12:\n",
      "card video monitor cards drivers bus vga driver color memory\n",
      "Topic 13:\n",
      "like sounds looks look bike sound lot things really thing\n",
      "Topic 14:\n",
      "don know want let need doesn little sure sorry things\n",
      "Topic 15:\n",
      "car cars engine speed good bike driver road insurance fast\n",
      "Topic 16:\n",
      "ve got seen heard tried good recently times try couple\n",
      "Topic 17:\n",
      "use used using work available want software need image data\n",
      "Topic 18:\n",
      "think don lot try makes really pretty wasn bit david\n",
      "Topic 19:\n",
      "com list dave internet article sun hp email ibm phone\n"
     ]
    }
   ],
   "source": [
    "#display_topics(nmf, tfidf_feature_names, no_top_words)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "thylacine.txt\n",
      "elizabeth_needham.txt\n",
      "gunnhild.txt\n",
      "uranus.txt\n",
      "yard.txt\n",
      "zinta.txt\n",
      "equipartition_theorem.txt\n",
      "sunderland_echo.txt\n",
      "thespis.txt\n",
      "hawes.txt\n",
      "hill.txt\n",
      "shiloh.txt\n",
      "12\n"
     ]
    }
   ],
   "source": [
    "# to read in files from a folder\n",
    "import os\n",
    "files = {}\n",
    "filepath = '../A-data/mallet-sample-data'\n",
    "for filename in os.listdir(filepath):\n",
    "    print(filename)\n",
    "    if filename.endswith(\".txt\"):\n",
    "        fpath = filepath + '/' + filename\n",
    "        with open(fpath, \"r\") as file:\n",
    "            files[filename] = file.read()\n",
    "print(len(files)) \n",
    "\n",
    "#for filename, text in files.items():\n",
    "#    print(filename)\n",
    "#    print(\"=\" * 80)\n",
    "#    print(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
