{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# HW8: Topic Modeling" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "## =======================================================\n", "## IMPORTING\n", "## =======================================================\n", "import os\n", "def get_data_from_files(path):\n", " directory = os.listdir(path)\n", " results = []\n", " for file in directory:\n", " f=open(path+file, encoding = \"ISO-8859-1\")\n", " results.append(f.read())\n", " f.close()\n", " return results\n", "\n", "\n", "## =======================================================\n", "## MODELING\n", "## =======================================================\n", "import pandas as pd\n", "from sklearn.decomposition import LatentDirichletAllocation\n", "from sklearn.feature_extraction.text import CountVectorizer\n", "import gensim\n", "from gensim.utils import simple_preprocess\n", "from gensim.parsing.preprocessing import STOPWORDS\n", "\n", "def run_lda(data, num_topics, stop_words):\n", " cv = CountVectorizer(stop_words = stop_words)\n", " lda_vec = cv.fit_transform(data)\n", " lda_columns = cv.get_feature_names()\n", " corpus = pd.DataFrame(lda_vec.toarray(), columns = lda_columns)\n", " lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10, \n", " learning_method='online')\n", " lda_model = lda.fit_transform(lda_vec)\n", " print_topics(lda, cv)\n", " return lda_model, lda, lda_vec, cv, corpus\n", "\n", "\n", "## =======================================================\n", "## HELPERS\n", "## =======================================================\n", "import numpy as np\n", "np.random.seed(210)\n", "\n", "def print_topics(model, vectorizer, top_n=10):\n", " for idx, topic in enumerate(model.components_):\n", " print(\"Topic %d:\" % (idx))\n", " print([(vectorizer.get_feature_names()[i], topic[i])\n", " for i in topic.argsort()[:-top_n - 1:-1]])\n", " \n", "\n", "## =======================================================\n", "## VISUALIZING\n", "## ======================================================= \n", "import pyLDAvis.sklearn as LDAvis\n", "import pyLDAvis\n", "\n", "def start_vis(lda, lda_vec, cv):\n", " panel = LDAvis.prepare(lda, lda_vec, cv, mds='tsne')\n", "# pyLDAvis.show(panel)\n", " pyLDAvis.save_html(panel, 'FinalProject_lda_2.html')" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | 0 | \n", "labels | \n", "
---|---|---|
0 | \n", "yeah first_person_pronoun want to address the ... | \n", "yes | \n", "
1 | \n", "umm pamela can pronoun hear first_person_prono... | \n", "yes | \n", "
2 | \n", "its on september th kayla and david first_pers... | \n", "yes | \n", "
3 | \n", "hi ladies first_person_pronoun wanted to tell ... | \n", "yes | \n", "
4 | \n", "lord forgive pronoun pronoun dont know what pr... | \n", "yes | \n", "
... | \n", "... | \n", "... | \n", "
561 | \n", "i pray that first_person_pronoun family will r... | \n", "unknown | \n", "
562 | \n", "when asked if pronoun had a last statement pro... | \n", "yes | \n", "
563 | \n", "what is about to transpire in a few moments is... | \n", "no | \n", "
564 | \n", "none | \n", "yes | \n", "
565 | \n", "statement to the media first_person_pronoun at... | \n", "yes | \n", "
566 rows × 2 columns
\n", "