HW 8: Topic Modeling with LDA

In [1]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    """Read every file under `path` and return the raw texts as a list of strings."""
    results = []
    for file in os.listdir(path):
        # read as Latin-1 (ISO-8859-1) to avoid decode errors on these files
        with open(os.path.join(path, file), encoding="ISO-8859-1") as f:
            results.append(f.read())
    return results


## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

def run_lda(data, n_components, cv):
    """Vectorize `data` with `cv`, fit LDA, and print the top words per topic."""
    lda_vec = cv.fit_transform(data)
    lda_columns = cv.get_feature_names()  # get_feature_names_out() in newer sklearn
    corpus = pd.DataFrame(lda_vec.toarray(), columns=lda_columns)  # kept for inspection; not used below
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    learning_method='online')
    lda_model = lda.fit_transform(lda_vec)  # document-topic matrix
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv


## =======================================================
## HELPERS
## =======================================================
import numpy as np
np.random.seed(210)

def print_topics(model, vectorizer, top_n=10):
    # model.components_ holds each topic's word weights (pseudo-counts)
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort()[:-top_n - 1:-1] walks the top_n largest weights in descending order
        print([(vectorizer.get_feature_names()[i], topic[i])
               for i in topic.argsort()[:-top_n - 1:-1]])
        

## =======================================================
## VISUALIZING
## =======================================================        
import pyLDAvis.sklearn as LDAvis  # renamed to pyLDAvis.lda_model in newer pyLDAvis releases
import pyLDAvis

def start_vis(lda, lda_vec, cv):
    """Prepare the interactive pyLDAvis panel and save it to HTML."""
    panel = LDAvis.prepare(lda, lda_vec, cv, mds='tsne')
#     pyLDAvis.show(panel)
    pyLDAvis.save_html(panel, 'HW8_lda_all_2.html')
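
run_lda prints the top words per topic, but the lda_model it returns is also worth inspecting: it is the document-topic matrix from fit_transform, one row per speech. A minimal sketch for peeking at it (dominant_topics is a hypothetical helper, not part of the assignment code, and assumes a prior run_lda call):

import numpy as np

def dominant_topics(doc_topic, n_docs=5):
    # Each row is one document's topic distribution (it sums to ~1);
    # argmax picks the topic with the largest share.
    for i in range(min(n_docs, doc_topic.shape[0])):
        print('Doc %d -> topic %d, weights %s'
              % (i, doc_topic[i].argmax(), np.round(doc_topic[i], 3)))

# dominant_topics(lda_model)   # e.g. after any run_lda(...) call below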
In [2]:
# DATA SET 1
# data = get_data_from_files('Dog_Hike/')
# lda_model, lda, lda_vec, cv = run_lda(data)
In [3]:
# DATA SET 2
data_fd = get_data_from_files('110/110-f-d/')
data_fr = get_data_from_files('110/110-f-r/')
data_md = get_data_from_files('110/110-m-d/')
data_mr = get_data_from_files('110/110-m-r/')

female_data = data_fd + data_fr 
male_data = data_md + data_mr
dem_data = data_md + data_fd
rep_data = data_mr + data_fr

all_data = female_data + male_data

# DATA SET 2 -- SMALL
female_data_sm = data_fd[:10] + data_fr[:10] 
male_data_sm = data_md[:10] + data_mr[:10]
dem_data_sm = data_md[:10] + data_fd[:10]
rep_data_sm = data_mr[:10] + data_fr[:10]

all_data_sm = female_data_sm + male_data_sm
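
A quick sanity check on the slices built above can be useful; a small sketch (the counts simply reflect how many files sit in each 110/ subdirectory, with the small sample capped at 10 files per group):

# Sketch: confirm how many speeches landed in each slice.
for name, docs in [('female', female_data), ('male', male_data),
                   ('dem', dem_data), ('rep', rep_data),
                   ('all', all_data), ('all (small)', all_data_sm)]:
    print(name, len(docs))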

TEST 1: Starting Point

In [4]:
cv = CountVectorizer()
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('the', 785.0256064179524), ('to', 645.6508076986827), ('of', 563.3439173262947), ('and', 542.7164636391508), ('that', 445.9161141427595), ('we', 267.91064217185743), ('in', 259.49073846656427), ('is', 246.60000754980436), ('this', 224.9582831133843), ('it', 190.17402853199783)]
Topic 1:
[('and', 209.6750086441383), ('the', 192.9853786586076), ('of', 170.17785974470274), ('to', 169.70137718746872), ('that', 144.5421140279574), ('in', 113.54346252014587), ('we', 83.96767998571751), ('for', 75.05958453892264), ('have', 73.0637394952233), ('this', 64.931179971705)]
Topic 2:
[('the', 350.10927536238313), ('to', 197.89358554078555), ('of', 146.67751590105448), ('in', 145.35814968113394), ('that', 125.02685414884509), ('and', 124.73104608809528), ('for', 87.16774469308824), ('this', 70.90221338618404), ('have', 68.46045981739343), ('are', 62.314452707063005)]
Topic 3:
[('the', 71961.31057183248), ('to', 44705.23814674544), ('of', 41455.018971298385), ('and', 39039.12153767716), ('that', 26890.762686276157), ('in', 24309.698791032286), ('we', 17147.456346668612), ('is', 16298.293847996309), ('this', 15457.141160161287), ('for', 13903.32208440825)]

TEST 2: Removing Stopwords

In [5]:
cv = CountVectorizer(stop_words='english')
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('mr', 3293.975742344425), ('doc', 2903.989863420596), ('text', 2801.4284727251625), ('docno', 2787.096517516463), ('house', 2491.3656167861145), ('speaker', 1936.7215609139132), ('people', 1880.9648493388456), ('american', 1558.8122415826244), ('representatives', 1508.5406616758924), ('time', 1445.4153209492938)]
Topic 1:
[('mr', 1443.8069009953228), ('going', 949.510530729654), ('house', 773.5675529576582), ('congress', 737.9341648465683), ('think', 722.0791364905554), ('speaker', 700.5863599632792), ('president', 691.3103457691248), ('know', 670.4932338000776), ('want', 665.1242629553672), ('people', 663.164365181927)]
Topic 2:
[('mr', 59.28095347111776), ('doc', 51.89402995488438), ('docno', 48.53447125156012), ('text', 35.85343160861242), ('speaker', 34.049654220877876), ('house', 31.122051220366703), ('people', 24.68145897680303), ('2007', 23.92969861080877), ('representatives', 23.379062772686584), ('act', 21.14284614431352)]
Topic 3:
[('mr', 3613.724855112024), ('text', 2935.1442433738835), ('docno', 2891.163300808986), ('doc', 2803.7861374550125), ('house', 2183.9229859117095), ('speaker', 1845.0051493694714), ('representatives', 1578.2613694988454), ('act', 1550.9527400895213), ('time', 1496.273217639627), ('support', 1481.6126149688046)]
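
The heavy 'doc', 'docno', and 'text' terms look like leftover markup tokens from the raw files rather than actual speech. TEST 3 treats them as extra stopwords, but an alternative is to strip tag-like markup while loading. A hedged sketch (it assumes the files wrap each speech in <DOC>/<DOCNO>/<TEXT>-style tags, which the bigrams in TEST 6 also suggest; strip_markup is a hypothetical helper):

import re

def strip_markup(raw_text):
    # Drop SGML-style tags such as <DOC>, <DOCNO>, <TEXT> so they never
    # reach the vectorizer in the first place.
    return re.sub(r'</?[A-Za-z][^>]*>', ' ', raw_text)

# all_data_sm_clean = [strip_markup(d) for d in all_data_sm]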

TEST 3: Removing Additional Stopwords

In [6]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('going', 1223.0794213891406), ('think', 860.919891930581), ('want', 806.2802834714961), ('president', 777.5285681797303), ('know', 767.5018698741118), ('members', 764.1493510813899), ('people', 750.5861569902328), ('just', 720.1551397910393), ('important', 652.8674369137636), ('american', 645.5669007150559)]
Topic 1:
[('doc', 492.42201870737074), ('time', 397.20357085486114), ('ms', 342.7829248457909), ('support', 302.8679927319846), ('united', 296.5425651983603), ('people', 295.27235945708003), ('lehtinen', 292.6371834944193), ('ros', 288.2906377961998), ('states', 250.1969200834086), ('2007', 241.80896081436128)]
Topic 2:
[('doc', 2823.632847825793), ('people', 1710.2262130192466), ('act', 1433.5146752331345), ('time', 1372.7149807390128), ('2007', 1330.628201919425), ('american', 1220.7498374937923), ('energy', 1209.643578081836), ('support', 1143.699753548295), ('ms', 1096.2574658725252), ('today', 1069.482941383811)]
Topic 3:
[('doc', 2383.5729072806735), ('mrs', 1429.9399136208651), ('2007', 1212.198115838467), ('time', 1159.618487288165), ('act', 1118.6419813221924), ('people', 1014.5251239199333), ('support', 1010.3796061815099), ('today', 955.9342316008651), ('american', 844.8905076582939), ('year', 748.9760666328012)]

TEST 4: Removing Even More Stopwords

In [7]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(stop_words=stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('support', 32.905659600631736), ('american', 32.16964698879218), ('going', 32.036298544827645), ('just', 31.2133735957517), ('mrs', 30.820314702519457), ('people', 30.44554640882069), ('think', 29.608185251004205), ('act', 27.86855501192981), ('ms', 27.006672641782377), ('chairman', 25.690327259345892)]
Topic 1:
[('going', 1577.4316943010276), ('people', 1198.9148188132745), ('know', 1193.079392138637), ('american', 1100.6620372217196), ('think', 1082.7233852247489), ('just', 992.918926931911), ('mrs', 893.8156720522584), ('president', 856.8484276454437), ('members', 818.3811372587752), ('say', 790.8846707722173)]
Topic 2:
[('people', 2528.4169798402877), ('act', 2302.775033485123), ('support', 2144.682715000717), ('ms', 1837.8462279920489), ('american', 1714.9084127335182), ('2008', 1493.3555849781776), ('energy', 1487.7742875873405), ('country', 1483.7193272898774), ('chairman', 1474.037178080695), ('new', 1401.9773848314558)]
Topic 3:
[('act', 22.94664241586246), ('support', 21.497828804618532), ('going', 15.94381022299248), ('american', 15.675557677179963), ('ms', 15.606433165206985), ('think', 15.2461639824401), ('important', 14.736610471171419), ('like', 14.412796009685895), ('energy', 14.08269856172683), ('chairman', 14.05336746096302)]

TEST 5: Adding N-grams, Removing Even More Stopwords

In [8]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american', 
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know', 
                        'make', 'people']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range = (1,2), stop_words = stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('act', 36.66768279270877), ('legislation', 31.070329962489307), ('thank', 28.800527749526104), ('energy', 27.717882874025612), ('states', 26.56695077929489), ('new', 26.51192900533049), ('chairman', 25.907652378757465), ('state', 25.554054301940976), ('work', 25.434945910787366), ('iraq', 25.115253945396947)]
Topic 1:
[('act', 31.612490694934145), ('important', 26.736546127449667), ('2008', 26.37948623500332), ('iraq', 25.109750087949482), ('energy', 23.492239563506164), ('legislation', 23.35996683600637), ('work', 23.310251300327558), ('need', 22.474196453084136), ('children', 22.47097667332001), ('years', 21.385513131529684)]
Topic 2:
[('members', 824.0505496389474), ('important', 765.6314484137072), ('say', 707.2542727357181), ('floor', 633.2985843928423), ('florida', 546.9139022950272), ('health', 535.7953945001675), ('iraq', 534.0588353672365), ('care', 530.8908230252921), ('need', 528.3923653890445), ('like', 523.8641693579322)]
Topic 3:
[('act', 2432.6609537332497), ('energy', 1621.4265716042808), ('2008', 1610.0131321230358), ('chairman', 1535.6103817602768), ('year', 1521.416880654621), ('country', 1475.5741701996863), ('new', 1435.1806639999768), ('years', 1405.7550672255463), ('legislation', 1378.4426264568121), ('states', 1362.3151674216404)]

TEST 6: Forcing Bigrams with the Original Stopword List

In [9]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american', 
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know', 
                        'make', 'people']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range=(2, 2), stop_words="english")  # built-in list only; the custom stop_words set is applied in TEST 7
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('mr speaker', 632.5349408276707), ('american people', 485.5510629318502), ('think important', 447.9457027153469), ('mr meek', 440.3107796529081), ('meek florida', 438.72173832899045), ('house representatives', 382.97412635757473), ('make sure', 309.164785979551), ('docno text', 305.147959904683), ('working group', 305.0524233920746), ('doc doc', 301.3262271699989)]
Topic 1:
[('house representatives', 50.97188635507018), ('doc docno', 42.769248255056624), ('mr speaker', 41.148567987312866), ('doc doc', 40.84180810775652), ('text doc', 39.578877530676586), ('docno text', 37.53809193126818), ('mrs emerson', 24.586193677070494), ('2007 docno', 24.310674299417535), ('text mr', 20.746781420609015), ('docno mrs', 19.32900027780685)]
Topic 2:
[('mr speaker', 126.35847067979593), ('text doc', 103.61902259209496), ('house representatives', 97.77531484638467), ('mr smith', 96.12778028812302), ('smith nebraska', 95.954222917317), ('doc docno', 95.92069830251347), ('doc doc', 91.52929164473929), ('docno text', 86.44795970464406), ('docno mr', 62.658149787268485), ('text mr', 56.71216325678556)]
Topic 3:
[('house representatives', 2827.3794762422403), ('docno text', 2674.460128152421), ('doc docno', 2669.916043680545), ('text doc', 2667.874768197915), ('doc doc', 2637.738900648014), ('mr speaker', 2408.779675859461), ('2007 docno', 1702.8704058200706), ('text mr', 1119.148719797673), ('docno mr', 1115.978127370457), ('united states', 1026.6784978488581)]

TEST 7: Forcing Bigrams with the Updated Stopword List

In [10]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american', 
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know', 
                        'make', 'people']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range = (2,2), stop_words = stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 4, cv)
Topic 0:
[('ros lehtinen', 287.99674571136654), ('united states', 212.88517979724236), ('yield consume', 95.36334280112975), ('human rights', 94.8310842547404), ('balance ros', 74.09296317153331), ('reserve balance', 61.47549028214522), ('2008 ros', 50.340916667528404), ('lehtinen yield', 46.81060362199631), ('urge colleagues', 46.14316967247467), ('united nations', 42.50270522503713)]
Topic 1:
[('intelligence community', 51.416320979991916), ('united states', 48.861232790683346), ('intelligence authorization', 37.814042876735876), ('fiscal year', 37.7356186429753), ('authorization act', 35.2608918736775), ('act fiscal', 32.68785520063953), ('urge colleagues', 27.182605615462784), ('year 2008', 23.748039708127337), ('2008 reyes', 23.374774560433746), ('men women', 23.329937798969173)]
Topic 2:
[('meek florida', 431.7679323644925), ('working group', 301.1721713543166), ('30 working', 290.61206564754843), ('health care', 246.172378667399), ('florida 30', 183.52869918600052), ('men women', 166.6698494458708), ('united states', 157.05077457149477), ('come floor', 146.09454860653585), ('making sure', 123.27207441064166), ('new direction', 122.30973995889485)]
Topic 3:
[('united states', 830.120488056721), ('health care', 575.4112460113006), ('act 2008', 434.94576322654217), ('urge colleagues', 415.5168665094916), ('men women', 343.6982271182687), ('thank gentleman', 335.853897025432), ('federal government', 275.2428626934973), ('yield consume', 268.9542613469333), ('jones ohio', 256.6152986748394), ('reserve balance', 245.09773932316727)]

TEST 8: Expanding the Number of "Topics"

In [11]:
from sklearn.feature_extraction import text 
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american', 
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know', 
                        'make', 'people']
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range = (1,2), stop_words = stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 10, cv)
Topic 0:
[('work', 13.793059995499286), ('important', 13.76604273636567), ('act', 13.419789130317374), ('new', 12.914807584599824), ('members', 11.19910818147834), ('2008', 10.84333360880385), ('year', 10.707137102124278), ('like', 10.088574848446312), ('country', 10.051885123102622), ('need', 10.01031227172015)]
Topic 1:
[('act', 16.041556963677937), ('country', 10.729368181179112), ('states', 10.650578851005351), ('need', 10.603701731897639), ('like', 10.413134054267427), ('new', 10.230674737502392), ('years', 10.13819558251737), ('way', 9.93888802854641), ('iraq', 9.90717281191424), ('members', 9.860314264384956)]
Topic 2:
[('act', 2713.6999413047133), ('important', 1930.6948605105235), ('country', 1883.1870111929861), ('new', 1842.8807471118369), ('2008', 1802.33938646033), ('energy', 1778.5565138571055), ('need', 1762.5379904311105), ('iraq', 1761.984811866405), ('year', 1744.3987767420238), ('legislation', 1707.4305976465012)]
Topic 3:
[('act', 15.155234253650228), ('new', 12.666073929623048), ('year', 11.845172150300439), ('energy', 11.069639979148002), ('important', 10.93396789442089), ('thank', 10.908820911778799), ('years', 9.833527316055916), ('iraq', 9.813196437384686), ('united', 9.6594906706931), ('2008', 9.487692017985761)]
Topic 4:
[('act', 28.118089363778864), ('states', 22.569538051972504), ('united', 22.15151953963809), ('iraq', 21.296216268080116), ('country', 20.155592223462577), ('thank', 19.46879187220822), ('chairman', 19.06162940261678), ('ros lehtinen', 18.650312047792724), ('2008', 18.2624978726425), ('resolution', 18.23370406072209)]
Topic 5:
[('act', 15.951013587525683), ('chairman', 15.056507144625868), ('2008', 14.403812726796247), ('thank', 13.731474454023038), ('like', 13.230855602952436), ('work', 13.037335184930326), ('new', 12.117244698001675), ('iraq', 11.938644857982549), ('legislation', 11.141192962423096), ('year', 11.028360838629808)]
Topic 6:
[('act', 16.879590397688872), ('like', 11.688163721301796), ('legislation', 10.76032008150553), ('health', 10.676334209813014), ('chairman', 10.138702316765912), ('years', 9.415429751114017), ('important', 9.273169040389982), ('states', 9.259411093194316), ('year', 9.206538429428054), ('country', 9.127307496217531)]
Topic 7:
[('act', 14.030049233832814), ('need', 9.7061281330386), ('thank', 8.80905607835847), ('year', 8.73517099627621), ('legislation', 8.62736002401648), ('years', 8.584510569902532), ('important', 8.570163616623374), ('health', 8.37066177068542), ('states', 8.176558707270507), ('2008', 8.07729570855648)]
Topic 8:
[('act', 39.350773376753864), ('country', 25.55947887175521), ('year', 22.269379717957843), ('chairman', 21.500977582131103), ('states', 20.620174638874648), ('energy', 20.593833412392144), ('iraq', 19.607361764193275), ('work', 19.552350624061376), ('years', 18.222282773181643), ('need', 18.139569210736752)]
Topic 9:
[('energy', 12.455685084580635), ('act', 10.387544496758242), ('new', 9.919236694563685), ('tax', 9.677072779076331), ('years', 9.633499540889447), ('country', 9.11970123706493), ('important', 8.843057919354466), ('legislation', 8.724643393993052), ('year', 8.560082355635299), ('2008', 8.369028698110773)]
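
At 10 topics the word lists start to look interchangeable, so it is worth asking how many topics the data supports. One rough check is sklearn's perplexity on the fitted document-term matrix; a minimal sketch reusing all_data_sm and the stop_words set from this test (lower is generally better, though perplexity does not always track interpretability):

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Sketch: fit LDA at a few topic counts and compare perplexity on the
# same counts matrix (no held-out split, so treat it as a rough guide).
for k in (4, 10, 20):
    cv_k = CountVectorizer(ngram_range=(1, 2), stop_words=stop_words)
    counts = cv_k.fit_transform(all_data_sm)
    lda_k = LatentDirichletAllocation(n_components=k, max_iter=10,
                                      learning_method='online').fit(counts)
    print(k, 'topics -> perplexity:', round(lda_k.perplexity(counts), 1))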

TEST 9: Removing Even More Stopwords

In [12]:
from sklearn.feature_extraction import text 
additional_stopwords = [
 '2007',
 '2008',
 'act',
 'american',
 'chairman',
 'committee',
 'congress',
 'country',
 'doc',
 'docno',
 'don',
 'floor',
 'going',
 'government',
 'house',
 'important',
 'just',
 'know',
 'legislation',
 'like',
 'madam',
 'make',
 'members',
 'mr',
 'mrs',
 'ms',
 'need',
 'new',
 'people',
 'president',
 'representatives',
 'say',
 'speaker',
 'state',
 'states',
 'support',
 'text',
 'thank',
 'think',
 'time',
 'today',
 'want',
 'work',
 'year'
]
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range = (1,2), stop_words = stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data_sm, 10, cv)
Topic 0:
[('energy', 1765.3356449980315), ('iraq', 1714.6507840290355), ('years', 1652.4069549930955), ('health', 1562.106165609075), ('children', 1456.0006219143443), ('colleagues', 1340.1383761100606), ('tax', 1339.5691631891314), ('national', 1277.514421509431), ('care', 1265.2391788785442), ('said', 1258.442467174732)]
Topic 1:
[('ros lehtinen', 237.0685528344471), ('lehtinen', 225.37293294649396), ('ros', 225.1347499414351), ('united', 94.9788698004813), ('lehtinen yield', 75.01904327327165), ('iran', 67.5381317258774), ('florida', 64.83287346361791), ('resolution', 63.01080217738324), ('iraq', 61.46882630806606), ('balance ros', 57.77931673629426)]
Topic 2:
[('energy', 12.1541576615457), ('iraq', 9.734191420342404), ('colleagues', 8.507005660912085), ('national', 8.054744392564626), ('years', 7.726739295506307), ('children', 7.193657333118239), ('americans', 7.145725926178417), ('money', 7.110969926426708), ('america', 6.842671870865755), ('families', 6.829856703051992)]
Topic 3:
[('years', 10.673163126097423), ('iraq', 10.405974512660416), ('united', 9.318625511209426), ('children', 8.862647572286914), ('amendment', 8.233533648721869), ('health', 8.209657890888913), ('national', 8.067410131586898), ('vote', 7.920969412066217), ('million', 7.7291485389213195), ('energy', 7.319163910747757)]
Topic 4:
[('years', 9.936405894601808), ('health', 9.073878089195453), ('energy', 8.445844328553985), ('tax', 8.360775357282732), ('colleagues', 7.879296684221084), ('children', 7.207532062529005), ('national', 7.02256751982236), ('iraq', 7.01406240136335), ('day', 6.808135647304428), ('security', 6.72353992512044)]
Topic 5:
[('tax', 8.672601132973154), ('health', 8.653134234255871), ('iraq', 7.524900453185688), ('united', 7.504503459585607), ('percent', 7.201434579133791), ('right', 7.077860907892128), ('said', 6.984059745162231), ('children', 6.647839782700351), ('years', 6.525051158114006), ('america', 6.5054167523521755)]
Topic 6:
[('energy', 27.332773323261744), ('iraq', 20.57088808669758), ('years', 19.971577282645924), ('tax', 19.553546267678758), ('health', 19.349854645831453), ('children', 18.957246431091022), ('united', 18.548542573513963), ('percent', 18.420402998236717), ('great', 17.976136599478775), ('national', 17.775622760697754)]
Topic 7:
[('iraq', 11.067802282117137), ('years', 10.636273201206663), ('health', 9.989511693374645), ('national', 9.20869973072133), ('energy', 8.691067589066026), ('war', 8.429096976314407), ('said', 8.19836082254055), ('united', 8.074614622228061), ('federal', 8.055613130078656), ('colleagues', 7.886078947937085)]
Topic 8:
[('iraq', 11.964298654015725), ('years', 11.493111286120953), ('energy', 10.655594027889745), ('children', 10.483042085543381), ('way', 9.894276255337934), ('care', 9.11659341209796), ('tax', 8.852003378804378), ('national', 8.525223352811317), ('working', 8.330635873189685), ('colleagues', 8.223063958991043)]
Topic 9:
[('energy', 10.413222137610388), ('health', 9.60589693824922), ('iraq', 9.539767380112075), ('care', 9.1962974138576), ('tax', 8.650530214547592), ('children', 8.521392791864116), ('said', 7.900106401353153), ('america', 7.6471760164072125), ('united', 7.2039278405473475), ('colleagues', 7.081099056080777)]
In [13]:
additional_stopwords = ['mr', 'docno', 'house', 
                        'speaker', 'text', 'congress', 'representatives', 
                        'doc', 'time', 'want', 'today', '2007', 'support', 'american', 
                        'president', 'ms', 'mrs', 'going', 'think', 'just', 'know', 
                        'make', 'people', 'act', 'thank', 'need', 'work', 'country',
                       'say', 'year', 'chairman', 'new', 'important','2008',
                       'legislation', 'members', 'state', 'states', 'madam', 'floor', 
                        'like', 'don', 'government', 'committee']
In [14]:
sorted(additional_stopwords)
Out[14]:
['2007',
 '2008',
 'act',
 'american',
 'chairman',
 'committee',
 'congress',
 'country',
 'doc',
 'docno',
 'don',
 'floor',
 'going',
 'government',
 'house',
 'important',
 'just',
 'know',
 'legislation',
 'like',
 'madam',
 'make',
 'members',
 'mr',
 'mrs',
 'ms',
 'need',
 'new',
 'people',
 'president',
 'representatives',
 'say',
 'speaker',
 'state',
 'states',
 'support',
 'text',
 'thank',
 'think',
 'time',
 'today',
 'want',
 'work',
 'year']

TEST 10: All Documents Across 40 Topics

In [ ]:
from sklearn.feature_extraction import text 
additional_stopwords = [
 '2007',
 '2008',
 'act',
 'american',
 'chairman',
 'committee',
 'congress',
 'country',
 'doc',
 'docno',
 'don',
 'floor',
 'going',
 'government',
 'house',
 'important',
 'just',
 'know',
 'legislation',
 'like',
 'madam',
 'make',
 'members',
 'mr',
 'mrs',
 'ms',
 'need',
 'new',
 'people',
 'president',
 'representatives',
 'say',
 'speaker',
 'state',
 'states',
 'support',
 'text',
 'thank',
 'think',
 'time',
 'today',
 'want',
 'work',
 'year'
]
stop_words = text.ENGLISH_STOP_WORDS.union(additional_stopwords)

cv = CountVectorizer(ngram_range = (1,2), stop_words = stop_words)
lda_model, lda, lda_vec, cv = run_lda(all_data, 40, cv)
Topic 0:
[('children', 1.7587778397076301), ('years', 1.5666750618727092), ('iraq', 1.556915690722305), ('health', 1.3496488015112555), ('national', 1.3428709308529294), ('war', 1.3164673291993687), ('energy', 1.2767949403229732), ('oil', 1.261727098817472), ('america', 1.2571810537715866), ('colleagues', 1.2255317040143905)]
Topic 1:
[('iraq', 2.569925360636842), ('years', 2.2865516090727587), ('national', 1.6473154283379028), ('health', 1.547527073303042), ('war', 1.4914948204485374), ('security', 1.473762810653364), ('nation', 1.4716302711848401), ('united', 1.4570678518581972), ('children', 1.4214651233460969), ('percent', 1.389139376099727)]
Topic 2:
[('south mississippi', 5.168997486243519), ('children', 2.6842082558596587), ('taylor mississippi', 2.543477173067218), ('iraq', 2.3722853348636126), ('national', 2.154144508312056), ('million', 2.020630031644396), ('energy', 1.9381247541127267), ('war', 1.8354254445147145), ('years', 1.8317975840417466), ('colleagues', 1.7775782888656277)]
Topic 3:
[('years', 1.9239437502808472), ('national', 1.7486861130854656), ('energy', 1.6223811044758671), ('iraq', 1.4978903806661452), ('amendment', 1.435378288868421), ('children', 1.3764643666292256), ('health', 1.3447813716248922), ('war', 1.2873271928923584), ('united', 1.2837985307634543), ('billion', 1.250692133300797)]
Topic 4:
[('rahall', 38.90973094688447), ('balance rahall', 31.395654582337233), ('rahall yield', 17.578465642250283), ('29 rahall', 12.991697553329114), ('title rahall', 12.729981210717925), ('04 rahall', 11.868671044725543), ('16 rahall', 11.441685144252968), ('pending measure', 10.015804733927633), ('rahall suspend', 9.955667212023354), ('rahall comprehensive', 9.448341736011537)]
Topic 5:
In [ ]:
start_vis(lda, lda_vec, cv)
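
start_vis writes the panel to HW8_lda_all_2.html; to view it inline instead, pyLDAvis can render directly in the notebook. A minimal sketch assuming the lda, lda_vec, and cv returned by the last run_lda call:

import pyLDAvis
import pyLDAvis.sklearn as LDAvis  # pyLDAvis.lda_model in newer releases

panel = LDAvis.prepare(lda, lda_vec, cv, mds='tsne')
pyLDAvis.display(panel)  # returns HTML that Jupyter renders in the output cell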