## =======================================================
## IMPORTING
## =======================================================
import os

def get_data_from_files(path):
    """Read every file in `path` and return a list of document strings."""
    results = []
    for filename in sorted(os.listdir(path)):  # sort for a stable document order
        with open(os.path.join(path, filename), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results
## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
def run_lda(data, n_components, cv):
    """Vectorize `data` with `cv`, fit LDA, print the topics, and return everything."""
    lda_vec = cv.fit_transform(data)
    # get_feature_names() was renamed get_feature_names_out() in scikit-learn >= 1.0
    lda_columns = cv.get_feature_names()
    # Document-term matrix as a DataFrame, kept around for ad hoc inspection.
    corpus = pd.DataFrame(lda_vec.toarray(), columns=lda_columns)
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    learning_method='online')
    lda_model = lda.fit_transform(lda_vec)  # document-topic weight matrix
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv
## =======================================================
## HELPERS
## =======================================================
import numpy as np

np.random.seed(210)  # fix the seed so repeated runs are comparable

def print_topics(model, vectorizer, top_n=10):
    """Print the `top_n` highest-weighted terms for each fitted topic."""
    feature_names = vectorizer.get_feature_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % idx)
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
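
# A small helper sketch, not part of the original workflow: `run_lda` returns the
# document-topic matrix from `fit_transform` (one row per document, one column per
# topic), so each document's dominant topic is just the argmax of its row.
def dominant_topics(doc_topic_matrix):
    return np.argmax(doc_topic_matrix, axis=1)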
## =======================================================
## VISUALIZING
## =======================================================
import pyLDAvis.sklearn as LDAvis
import pyLDAvis

def start_vis(lda, lda_vec, cv):
    """Build the interactive pyLDAvis panel and save it as HTML."""
    panel = LDAvis.prepare(lda, lda_vec, cv, mds='tsne')
    # pyLDAvis.show(panel)  # open in a browser instead of saving
    pyLDAvis.save_html(panel, 'HW8_lda_all_2.html')
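
# Note: pyLDAvis 3.4+ dropped the `pyLDAvis.sklearn` module in favor of
# `pyLDAvis.lda_model` (an assumption about the newer API; verify against the
# installed version). A hedged fallback for recent installs would look like:
# try:
#     import pyLDAvis.sklearn as LDAvis
# except ImportError:
#     import pyLDAvis.lda_model as LDAvis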
# DATA SET 1
# data = get_data_from_files('Dog_Hike/')
# lda_model, lda, lda_vec, cv = run_lda(data, 4, CountVectorizer())  # example arguments
# DATA SET 2 -- the 110 corpus, split by gender and party
data_fd = get_data_from_files('110/110-f-d/')  # female Democrats
data_fr = get_data_from_files('110/110-f-r/')  # female Republicans
data_md = get_data_from_files('110/110-m-d/')  # male Democrats
data_mr = get_data_from_files('110/110-m-r/')  # male Republicans
female_data = data_fd + data_fr
male_data = data_md + data_mr
dem_data = data_md + data_fd
rep_data = data_mr + data_fr
all_data = female_data + male_data
# DATA SET 2 -- SMALL (first 10 documents from each subgroup)
female_data_sm = data_fd[:10] + data_fr[:10]
male_data_sm = data_md[:10] + data_mr[:10]
dem_data_sm = data_md[:10] + data_fd[:10]
rep_data_sm = data_mr[:10] + data_fr[:10]
all_data_sm = female_data_sm + male_data_sm
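
# Quick sanity check (a sketch, not in the original script): confirm how many
# documents landed in each subset before modeling.
print(len(female_data_sm), len(male_data_sm), len(all_data_sm))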
# EXPERIMENT 1: baseline -- raw counts, no stop word removal, 4 topics
cv = CountVectorizer()
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
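
# A hedged sketch, not part of the original analysis: because all_data_sm is the
# female documents followed by the male documents, the document-topic matrix can
# be split at len(female_data_sm) to compare average topic weights by group.
def mean_topic_weights_by_gender(doc_topic_matrix, n_female):
    female_mean = doc_topic_matrix[:n_female].mean(axis=0)
    male_mean = doc_topic_matrix[n_female:].mean(axis=0)
    return female_mean, male_mean
# e.g. mean_topic_weights_by_gender(lda_model, len(female_data_sm))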
# EXPERIMENT 2: the same model with scikit-learn's built-in English stop words
cv = CountVectorizer(stop_words='english')
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 3: extend the built-in list with corpus-specific boilerplate terms
from sklearn.feature_extraction import text

additional_stopwords = ['mr', 'docno', 'house',
                        'speaker', 'text', 'congress', 'representatives']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
cv = CountVectorizer(stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 4: add a few more high-frequency filler terms
additional_stopwords = ['mr', 'docno', 'house',
                        'speaker', 'text', 'congress', 'representatives',
                        'doc', 'time', 'want', 'today', '2007']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
cv = CountVectorizer(stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 5: a larger hand-curated stop list, unigrams and bigrams
additional_stopwords = ['mr', 'docno', 'house',
                        'speaker', 'text', 'congress', 'representatives',
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american',
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know',
                        'make', 'people']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
cv = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 6: bigrams only, with just the built-in English stop words
cv = CountVectorizer(ngram_range=(2, 2), stop_words='english')
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 7: bigrams only, with the hand-curated stop list from experiment 5
cv = CountVectorizer(ngram_range=(2, 2), stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
# EXPERIMENT 8: unigrams and bigrams with the same stop list, 10 topics
cv = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 10, cv)
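
# An alternative worth sketching (not used in the runs above): instead of
# hand-curating ever-longer stop lists, CountVectorizer can drop terms by
# document frequency -- max_df trims words appearing in most documents
# (procedural boilerplate like 'mr' or 'speaker'), min_df trims very rare ones.
# The variable name below is hypothetical.
cv_df_filtered = CountVectorizer(ngram_range=(1, 2), stop_words='english',
                                 max_df=0.9, min_df=2)
# e.g. run_lda(all_data_sm, 10, cv_df_filtered)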
# EXPERIMENT 9: a fully expanded, alphabetized stop list, 10 topics
additional_stopwords = [
    '2007', '2008', 'act', 'american', 'chairman', 'committee', 'congress',
    'country', 'doc', 'docno', 'don', 'floor', 'going', 'government', 'house',
    'important', 'just', 'know', 'legislation', 'like', 'madam', 'make',
    'members', 'mr', 'mrs', 'ms', 'need', 'new', 'people', 'president',
    'representatives', 'say', 'speaker', 'state', 'states', 'support', 'text',
    'thank', 'think', 'time', 'today', 'want', 'work', 'year'
]
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
cv = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 10, cv)
# FINAL RUN: the full corpus with the expanded stop list and 40 topics
cv = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data, 40, cv)
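
# A hedged evaluation sketch, not in the original workflow: scikit-learn's LDA
# exposes perplexity on the document-term matrix (lower is better), which gives
# a rough quantitative way to compare the vectorizer settings tried above.
print("Perplexity: %.1f" % lda.perplexity(lda_vec))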
start_vis(lda, lda_vec, cv)