HW8: Topic Modeling

In [ ]:
 
In [7]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every entry of a directory and return the file contents.

    Parameters
    ----------
    path : str
        Directory to read. A trailing path separator is optional.

    Returns
    -------
    list of str
        The text of each file, decoded as ISO-8859-1, in sorted filename order.

    Raises
    ------
    OSError
        If the directory cannot be listed or an entry cannot be opened as a
        file (for example, a sub-directory).
    """
    results = []
    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and therefore any downstream model) reproducible.
    for file in sorted(os.listdir(path)):
        # os.path.join works whether or not `path` ends with a separator,
        # and the context manager closes the handle even if read() raises.
        with open(os.path.join(path, file), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results


## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def run_lda(data, num_topics, stop_words, random_state=None):
    """Vectorize raw documents with bag-of-words counts and fit an LDA model.

    Parameters
    ----------
    data : iterable of str
        Raw documents.
    num_topics : int
        Number of LDA components (topics) to fit.
    stop_words : 'english', iterable of str, or None
        Stop words handed to ``CountVectorizer``. Any non-string iterable
        (set, frozenset, pandas Index, ...) is converted to a list first.
    random_state : int or None, optional
        Seed forwarded to ``LatentDirichletAllocation``. The default (None)
        keeps the previous behaviour of not passing an explicit seed.

    Returns
    -------
    tuple
        ``(lda_model, lda, lda_vec, cv, corpus)`` where ``lda_model`` is the
        output of ``lda.fit_transform``, ``lda`` the fitted estimator,
        ``lda_vec`` the sparse document-term count matrix, ``cv`` the fitted
        vectorizer and ``corpus`` a dense DataFrame of the counts with one
        column per vocabulary term.

    Side effects
    ------------
    Prints the top terms of every topic via ``print_topics``.
    """
    # Newer scikit-learn releases validate `stop_words` and only accept
    # 'english', a list or None, so normalise other iterables up front.
    if stop_words is not None and not isinstance(stop_words, str):
        stop_words = list(stop_words)

    cv = CountVectorizer(stop_words=stop_words)
    lda_vec = cv.fit_transform(data)

    # get_feature_names() was removed from newer scikit-learn releases in
    # favour of get_feature_names_out(); fall back to the old name only when
    # the new one is unavailable.
    feature_names = getattr(cv, "get_feature_names_out", None) or cv.get_feature_names
    lda_columns = feature_names()

    # NOTE: toarray() densifies the count matrix; fine for a few hundred
    # documents, but memory grows with documents x vocabulary.
    corpus = pd.DataFrame(lda_vec.toarray(), columns=lda_columns)
    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                    learning_method='online',
                                    random_state=random_state)
    lda_model = lda.fit_transform(lda_vec)
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv, corpus


## =======================================================
## HELPERS
## =======================================================
import numpy as np
# Seed NumPy's global random generator so repeated runs start from the same state.
# NOTE(review): presumably this is what makes the LDA fit repeatable, since no
# explicit random_state is passed to the estimator — confirm.
np.random.seed(210)

def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one row of term weights
        per topic).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn releases, ``get_feature_names()``.
    top_n : int, optional
        Number of terms to print per topic (default 10).

    Output
    ------
    For each topic, a ``Topic <idx>:`` line followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # Resolve the vocabulary once: the previous version re-fetched the whole
    # feature list for every printed term, and get_feature_names() is not
    # available on newer scikit-learn releases.
    lookup = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = lookup()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending, so walk the last `top_n` indices backwards.
        print([(feature_names[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
        

## =======================================================
## VISUALIZING
## =======================================================        
import pyLDAvis.sklearn as LDAvis
import pyLDAvis

def start_vis(lda, lda_vec, cv, output_path='FinalProject_lda_2.html', mds='tsne'):
    """Build a pyLDAvis panel for a fitted LDA model and save it as HTML.

    Parameters
    ----------
    lda : fitted LDA estimator
    lda_vec : document-term matrix the model was fitted on
    cv : fitted vectorizer that produced ``lda_vec``
    output_path : str, optional
        Destination HTML file. Defaults to the filename that used to be
        hard-coded, so existing calls behave as before. Pass a distinct path
        per run to avoid overwriting an earlier visualisation.
    mds : str, optional
        Multidimensional-scaling method forwarded to ``prepare``
        (default ``'tsne'``, as before).

    Side effects
    ------------
    Writes (or overwrites) ``output_path``.
    """
    # NOTE(review): `LDAvis` is the file-level alias for `pyLDAvis.sklearn`;
    # newer pyLDAvis releases appear to have renamed that module — confirm
    # against the installed version.
    panel = LDAvis.prepare(lda, lda_vec, cv, mds=mds)
    # For an interactive view instead of a saved file: pyLDAvis.show(panel)
    pyLDAvis.save_html(panel, output_path)
In [8]:
# Load the source CSV (path is relative to the notebook's working directory).
# The `last_statement` and `prior_record` columns are read further down in this cell.
df = pd.read_csv('../death_row_discritized.csv')

def to_string(tokens):
    """Turn the string form of a token list into one space-joined string.

    Parameters
    ----------
    tokens : str
        Text of a Python list literal of strings, e.g. ``"['a', 'b']"``.

    Returns
    -------
    str
        The tokens joined by single spaces, or the sentinel ``"error"`` when
        the value is not a parsable literal or not an iterable of strings
        (for example a missing/NaN cell).
    """
    import ast  # local import so this cell does not depend on another cell's imports

    try:
        # literal_eval only accepts Python literals, so CSV content can never
        # execute arbitrary code the way eval() would.
        return " ".join(ast.literal_eval(tokens))
    except (ValueError, SyntaxError, TypeError):
        # Unparsable text, non-string input, or non-string list items.
        return "error"
    
# Re-join each tokenised statement into a single whitespace-separated string.
# Series.map applies to_string element-wise to the one column that is needed,
# instead of building a row object for every record with df.apply(axis=1).
df['statement_string'] = df['last_statement'].map(to_string)

# Label column used for this run (alternative that was tried: df['vic_kid']).
y = df['prior_record'].values
y_labels = list(set(y))
X = df['statement_string'].values

# Column 0 holds the statement text; 'labels' holds the matching prior_record value.
all_df = pd.DataFrame(X)
all_df['labels'] = y
all_df
Out[8]:
0 labels
0 yeah first_person_pronoun want to address the ... yes
1 umm pamela can pronoun hear first_person_prono... yes
2 its on september th kayla and david first_pers... yes
3 hi ladies first_person_pronoun wanted to tell ... yes
4 lord forgive pronoun pronoun dont know what pr... yes
... ... ...
561 i pray that first_person_pronoun family will r... unknown
562 when asked if pronoun had a last statement pro... yes
563 what is about to transpire in a few moments is... no
564 none yes
565 statement to the media first_person_pronoun at... yes

566 rows × 2 columns

In [10]:
from sklearn.feature_extraction import text

# ENGLISH_STOP_WORDS is a frozenset; newer scikit-learn releases validate
# CountVectorizer's `stop_words` and only accept 'english', a list or None,
# so hand over a list.
stop_words = list(text.ENGLISH_STOP_WORDS)

# First pass: 4 topics over the statement strings (column 0 of all_df),
# filtering only the built-in English stop words.
lda_model, lda, lda_vec, cv, corpus = run_lda(all_df[0].values, 4, stop_words)
start_vis(lda, lda_vec, cv)
Topic 0:
[('police', 15.498393763666396), ('officer', 13.293726335860406), ('justice', 8.575686139667397), ('ye', 8.118290540812445), ('coldblooded', 7.2103022374946715), ('hollered', 7.208622163389597), ('equal', 7.191651994116875), ('human', 6.861131532143653), ('shall', 6.746579925770169), ('extend', 6.1953650299434075)]
Topic 1:
[('holy', 10.411963107103821), ('pinkerton', 8.891806924562207), ('live', 5.73994464843442), ('muslim', 4.872920285114169), ('islam', 4.382778789675088), ('express', 4.231767059514237), ('moment', 4.15625504724466), ('allah', 3.95341584327608), ('dungeon', 3.4900957139663564), ('fear', 3.484549451930885)]
Topic 2:
[('black', 10.576045681366006), ('forward', 10.079796718480985), ('lynching', 8.866536281296659), ('america', 8.624390050450529), ('continue', 7.877887015179041), ('happening', 7.764868809289017), ('state', 7.313737231383107), ('marching', 6.300665405590482), ('carry', 6.292434820202825), ('people', 6.108769496284327)]
Topic 3:
[('first_person_pronoun', 4163.850036911061), ('pronoun', 2575.8194241030806), ('love', 605.8537134654808), ('family', 286.6706374354036), ('know', 280.53421731789035), ('thank', 236.92298282232363), ('sorry', 224.59967868722484), ('want', 206.70383682285978), ('god', 189.26574251976234), ('yall', 188.68634502013964)]
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))
In [ ]:
# Flip the document-term counts so every row is a vocabulary term, then
# append each term's count summed over all documents.
ct = corpus.transpose()
ct['total'] = ct.sum(axis='columns')

# Keep only the terms whose overall count exceeds 68 (threshold as chosen
# in the original analysis) and report how many there are.
big_total = ct.loc[ct['total'].gt(68)]
len(big_total)
In [ ]:
# Number of rows in ct, i.e. the number of vocabulary terms (ct is corpus transposed).
len(ct)
In [ ]:
# Transpose back so the retained high-count terms become the column labels.
btt = big_total.T
In [63]:
# The column labels of btt are the high-count terms; they are used as extra stop words.
additional_stopwords = btt.columns
In [7]:
# Combine the built-in English stop words with the corpus-specific high-count
# terms. The union is a frozenset; newer scikit-learn releases only accept
# 'english', a list or None for CountVectorizer's `stop_words`, so convert it.
stop_words = list(text.ENGLISH_STOP_WORDS.union(additional_stopwords))
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-7-06dc32cfefee> in <module>
      1 from sklearn.feature_extraction import text
      2 
----> 3 stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

NameError: name 'additional_stopwords' is not defined
In [8]:
# Display the combined stop-word collection.
stop_words
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-8-07be5ff4bce8> in <module>
----> 1 stop_words

NameError: name 'stop_words' is not defined
In [6]:
# Second pass: 40 topics with the extended stop-word list.
# `data` is never assigned anywhere in this notebook (only in commented-out
# lines), so use the same statement strings as the first run.
# NOTE: this rebinds lda_model, lda, lda_vec, cv and corpus from the first run.
lda_model, lda, lda_vec, cv, corpus = run_lda(all_df[0].values, 40, stop_words)
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-6-857357aa498f> in <module>
----> 1 lda_model, lda, lda_vec, cv, corpus = run_lda(data, 40, stop_words)

NameError: name 'stop_words' is not defined
In [71]:
# Visualise the most recently fitted model.
# NOTE(review): start_vis saves to a fixed filename ('FinalProject_lda_2.html'),
# so this call overwrites the HTML written by the earlier start_vis call.
start_vis(lda, lda_vec, cv)
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))
In [11]:
import plotly.plotly as py
from plotly.grid_objs import Grid, Column
from plotly.tools import FigureFactory as FF

import pandas as pd
import time
---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-11-895d92d73ddf> in <module>
----> 1 import plotly.plotly as py
      2 from plotly.grid_objs import Grid, Column
      3 from plotly.tools import FigureFactory as FF
      4 
      5 import pandas as pd

ModuleNotFoundError: No module named 'plotly'
In [ ]: