## =======================================================
## IMPORTING
## =======================================================
import ast
import os
def get_data_from_files(path):
directory = os.listdir(path)
results = []
for file in directory:
f=open(path+file, encoding = "ISO-8859-1")
results.append(f.read())
f.close()
return results
## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
def run_lda(data, num_topics, stop_words):
    """Fit an online LDA topic model over `data` (iterable of document strings).

    Prints the top terms of every topic via print_topics, then returns
    (doc-topic matrix, fitted LDA model, term-count matrix, vectorizer,
    term-count DataFrame).
    """
    cv = CountVectorizer(stop_words=list(stop_words))
    lda_vec = cv.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement get_feature_names_out() but keep a fallback for old versions.
    if hasattr(cv, "get_feature_names_out"):
        lda_columns = cv.get_feature_names_out()
    else:
        lda_columns = cv.get_feature_names()
    corpus = pd.DataFrame(lda_vec.toarray(), columns=lda_columns)
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                    learning_method='online')
    lda_model = lda.fit_transform(lda_vec)
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv, corpus
## =======================================================
## HELPERS
## =======================================================
import numpy as np
# Fix the global NumPy RNG seed for reproducibility. NOTE(review): LDA's own
# randomness is controlled by its random_state parameter, which is not set
# here — confirm whether the runs are meant to be fully reproducible.
np.random.seed(210)
def print_topics(model, vectorizer, top_n=10):
    """Print the top_n highest-weighted (term, weight) pairs for each topic.

    `model` must expose `components_` (topic-term weights); `vectorizer`
    maps column indices back to vocabulary terms.
    """
    # Hoist the vocabulary lookup out of the loop: the original fetched it
    # once per topic, and get_feature_names() was removed in scikit-learn 1.2
    # in favour of get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort()[:-top_n-1:-1] walks the indices of the top_n largest weights.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
## =======================================================
## VISUALIZING
## =======================================================
# NOTE(review): in pyLDAvis >= 3.4 the sklearn adapter module was renamed
# from `pyLDAvis.sklearn` to `pyLDAvis.lda_model` — confirm the pinned
# pyLDAvis version, as this import fails on newer releases.
import pyLDAvis.sklearn as LDAvis
import pyLDAvis
def start_vis(lda, lda_vec, cv, output_html='FinalProject_lda_2.html'):
    """Build an interactive pyLDAvis panel and save it as an HTML file.

    output_html: destination path for the saved panel (default keeps the
    original hard-coded filename, so existing callers are unaffected).
    """
    # t-SNE projection of topics into 2-D for the inter-topic distance map.
    panel = LDAvis.prepare(lda, lda_vec, cv, mds='tsne')
    # pyLDAvis.show(panel)  # interactive alternative to saving a file
    pyLDAvis.save_html(panel, output_html)
# Load the dataset; the 'last_statement' column holds stringified token lists.
df = pd.read_csv('../death_row_discritized.csv')
def to_string(tokens):
    """Join a stringified token list (e.g. "['a', 'b']") into one string "a b".

    Uses ast.literal_eval rather than eval: the column comes straight from a
    CSV file, so evaluating it as arbitrary Python would be unsafe. Returns
    the sentinel "error" when the cell cannot be parsed as a list of strings.
    """
    try:
        return " ".join(ast.literal_eval(tokens))
    except (ValueError, SyntaxError, TypeError):
        # ValueError/SyntaxError: not a valid literal; TypeError: elements
        # are not strings (join fails) or tokens is not a string at all.
        return "error"
# Re-join each tokenised last statement back into a single document string.
df['statement_string'] = df.apply(lambda x: to_string(x['last_statement']), axis=1)
# y=df['vic_kid'].values
y=df['prior_record'].values  # labels for this experiment: prior record
y_labels = list(set(y))  # distinct label values
X=df['statement_string'].values  # one document string per row
all_df = pd.DataFrame(X)  # column 0 holds the documents
all_df['labels'] = y
# Bare expression: displays the frame only in a notebook; no effect as a script.
all_df
# data = get_data_from_files('Dog_Hike/')
# lda_model, lda, lda_vec, cv = run_lda(data,)
from sklearn.feature_extraction import text
# Baseline stop-word list (sklearn's built-in English set).
stop_words = text.ENGLISH_STOP_WORDS
# data_fd = get_data_from_files('110/110-f-d/')
# data_fr = get_data_from_files('110/110-f-r/')
# data = data_fd + data_fr
# data
# First pass: 4 topics over the last-statement documents.
lda_model, lda, lda_vec, cv, corpus = run_lda(all_df[0].values, 4, stop_words)
start_vis(lda, lda_vec, cv)
# corpus
# c2 = corpus.append(df.sum().rename('Total'))
# Treat very frequent terms as additional stop words for the second pass.
ct = corpus.T
ct['total'] = ct.sum(axis=1)  # total count of each term across all documents
big_total = ct[ct['total'] > 68]  # terms appearing more than 68 times
len(big_total)  # bare expressions: display only in a notebook
len(ct)
btt = big_total.T
additional_stopwords = btt.columns
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)
stop_words
# BUG FIX: the original called run_lda(data, 40, stop_words), but `data` is
# never defined (all of its assignments above are commented out), which
# raises NameError. Use the same document array as the first run.
lda_model, lda, lda_vec, cv, corpus = run_lda(all_df[0].values, 40, stop_words)
start_vis(lda, lda_vec, cv)
# NOTE(review): `plotly.plotly` was removed from the plotly package (the
# online/cloud functionality moved to the separate `chart_studio` package),
# and `plotly.tools.FigureFactory` is deprecated in favour of
# `plotly.figure_factory`. These imports fail on modern plotly installs —
# confirm the intended plotly version. None of these names (py, Grid,
# Column, FF, time) are used in the visible portion of this file.
import plotly.plotly as py
from plotly.grid_objs import Grid, Column
from plotly.tools import FigureFactory as FF
import pandas as pd
import time