THANK YOU SO MUCH, ALI <3

In [1]:
## =======================================================
## MODELING
## =======================================================
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS

def run_lda(data, num_topics, stop_words):
    """Vectorize ``data`` into term counts and fit an online LDA topic model.

    Tokens are runs of 3 to 15 ASCII letters; anything listed in
    ``stop_words`` is dropped by the vectorizer. After fitting, the top terms
    of every topic are printed via ``print_topics``.

    Parameters
    ----------
    data : iterable of str
        The raw documents, one string per document.
    num_topics : int
        Number of topics (``n_components``) for the LDA model.
    stop_words : str, collection of str, or None
        Stop words forwarded to ``CountVectorizer``. Any non-list collection
        (for example a ``frozenset``) is converted to a list first.

    Returns
    -------
    tuple
        ``(lda_model, lda, lda_vec, cv, corpus)`` where ``lda_model`` is the
        result of ``lda.fit_transform(lda_vec)``, ``lda`` is the fitted
        ``LatentDirichletAllocation`` estimator, ``lda_vec`` is the sparse
        count matrix, ``cv`` is the fitted ``CountVectorizer`` and ``corpus``
        is a dense ``DataFrame`` of the counts with one column per term.
    """
    # Recent scikit-learn releases validate ``stop_words`` and only accept
    # 'english', a list, or None, so sets/frozensets are normalised to a list.
    if stop_words is not None and not isinstance(stop_words, (str, list)):
        stop_words = list(stop_words)

    regex = "[a-zA-Z]{3,15}"
    cv = CountVectorizer(stop_words=stop_words, token_pattern=regex)
    lda_vec = cv.fit_transform(data)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older installations.
    if hasattr(cv, "get_feature_names_out"):
        lda_columns = cv.get_feature_names_out()
    else:
        lda_columns = cv.get_feature_names()

    # NOTE: this densifies the whole document-term matrix, which can be
    # memory-hungry for large corpora; it is kept because callers receive it.
    corpus = pd.DataFrame(lda_vec.toarray(), columns=lda_columns)

    lda = LatentDirichletAllocation(n_components=num_topics, max_iter=10,
                                    learning_method='online')
    lda_model = lda.fit_transform(lda_vec)
    print_topics(lda, cv)
    return lda_model, lda, lda_vec, cv, corpus


## =======================================================
## HELPERS
## =======================================================
import numpy as np
# Seed NumPy's global random number generator with a fixed value so that
# code drawing from it gives the same results on every fresh run.
# NOTE(review): presumably this is what makes the LDA fit repeatable, since
# run_lda passes no explicit random_state -- confirm.
np.random.seed(210)

def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    For each row of ``model.components_`` a header line ``Topic <idx>:`` is
    printed, followed by a list of ``(term, weight)`` tuples ordered from the
    largest weight downwards.

    Parameters
    ----------
    model : object
        Fitted topic model exposing ``components_`` (one row per topic, one
        column per vocabulary term).
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` or, on older
        scikit-learn versions, ``get_feature_names()``.
    top_n : int, optional
        Number of terms to show per topic (default 10).
    """
    # Look the vocabulary up once. The original called get_feature_names()
    # for every printed term, rebuilding the full vocabulary each time, and
    # that method no longer exists in scikit-learn >= 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        raw_names = vectorizer.get_feature_names_out()
    else:
        raw_names = vectorizer.get_feature_names()
    # Plain ``str`` objects keep the printed representation of the terms
    # unchanged regardless of the container type the vectorizer returns.
    feature_names = [str(name) for name in raw_names]

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice takes the last ``top_n``
        # indices, i.e. the heaviest terms first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
        

## =======================================================
## VISUALIZING
## =======================================================        
import pyLDAvis.sklearn as LDAvis
import pyLDAvis

def start_vis(lda, lda_vec, cv, output_path='HW8_V3_all.html', mds='tsne'):
    """Build a pyLDAvis panel for a fitted LDA model and save it as HTML.

    Parameters
    ----------
    lda : object
        Fitted LDA estimator passed to ``LDAvis.prepare``.
    lda_vec : object
        Document-term matrix the model was fitted on.
    cv : object
        Fitted vectorizer that produced ``lda_vec``.
    output_path : str, optional
        File the HTML visualisation is written to. Defaults to
        ``'HW8_V3_all.html'``, the name previously hard-coded here.
    mds : str, optional
        Multidimensional-scaling method forwarded to ``LDAvis.prepare``
        (default ``'tsne'``, as before).

    Returns
    -------
    None
        The panel is written to ``output_path``; nothing is returned.
    """
    panel = LDAvis.prepare(lda, lda_vec, cv, mds=mds)
    pyLDAvis.save_html(panel, output_path)
In [2]:
from sklearn.feature_extraction import text 
# Extra stop words on top of scikit-learn's built-in English list, kept in
# alphabetical order.
additional_stopwords = [
    '2007', '2008',
    'act', 'american',
    'chairman', 'committee', 'congress', 'country',
    'doc', 'docno', 'don',
    'floor',
    'going', 'government',
    'house',
    'important',
    'just',
    'know',
    'legislation', 'like',
    'madam', 'make', 'members', 'mr', 'mrs', 'ms',
    'need', 'new',
    'people', 'president',
    'representatives',
    'say', 'speaker', 'state', 'states', 'support',
    'text', 'thank', 'think', 'time', 'today',
    'want', 'work',
    'year',
]
# Full stop-word set: scikit-learn's English list plus the extra words above.
stop_words = text.ENGLISH_STOP_WORDS | set(additional_stopwords)
In [3]:
# Load the documents; the text to model lives in the column labelled '0'.
a_data = pd.read_csv('HW8_newdata.csv')

# Fit a 40-topic model on every document. For a quick smoke test, pass a
# small slice such as a_data['0'].values[:10] with fewer topics instead.
lda_model, lda, lda_vec, cv, corpus = run_lda(
    data=a_data['0'].values,
    num_topics=40,
    stop_words=stop_words,
)
Topic 0:
[('wage', 2580.9985055827883), ('workers', 2475.1823827792073), ('employees', 2436.123439641345), ('minimum', 2044.6538346412801), ('pay', 1784.2522976225891), ('discrimination', 1230.589706039403), ('employers', 1195.7205545950817), ('labor', 1172.8858894145699), ('women', 1105.6530072860626), ('employee', 1055.131285593534)]
Topic 1:
[('yield', 5595.629735699744), ('gentleman', 4614.842049805483), ('texas', 4177.316215712291), ('carolina', 3746.026251744131), ('member', 2997.2478048044773), ('friend', 2854.239657728002), ('north', 2278.1229144096756), ('colleague', 1972.2021938570897), ('california', 1942.8101279064392), ('south', 1933.3315741362635)]
Topic 2:
[('tax', 18587.67303853016), ('budget', 13019.446683662938), ('billion', 8683.828464021817), ('money', 8255.44527491738), ('spending', 8195.950789139948), ('increase', 6649.155758992434), ('taxes', 6534.317592445948), ('pay', 4881.635455573393), ('percent', 4691.393945908899), ('million', 3982.220498976064)]
Topic 3:
[('district', 4106.559491538709), ('alaska', 1606.0331232366252), ('columbia', 1350.9748140040206), ('voting', 1218.6517151575033), ('vote', 1205.3505490021591), ('abortion', 977.0174189763627), ('vermont', 865.7760353920947), ('citizens', 804.9364891221186), ('right', 642.9899693693989), ('washington', 640.0396248708615)]
Topic 4:
[('health', 18752.489169048975), ('care', 14072.032279563844), ('children', 12790.778775916688), ('insurance', 5302.375093327311), ('program', 4991.826221695516), ('medicare', 4911.224988954452), ('medical', 3529.129665461696), ('kids', 2824.194268564434), ('percent', 2774.195164244307), ('schip', 2646.316183073074)]
Topic 5:
[('housing', 6321.152784764877), ('financial', 3468.0129347931766), ('market', 2535.070754664156), ('credit', 2483.461916708439), ('economic', 2315.199982025768), ('loans', 2285.7898192909715), ('loan', 2195.2206211089933), ('home', 2056.510707319999), ('families', 2036.9199865005603), ('mortgage', 1894.1416037747026)]
Topic 6:
[('religious', 1063.1687585724528), ('church', 987.8850963259397), ('faith', 949.2985012326344), ('missing', 602.5212321639832), ('father', 578.1178345190999), ('music', 575.0879914038865), ('catholic', 509.6278233609482), ('prayer', 401.65232481418656), ('bible', 327.3732123836474), ('store', 244.53759135642676)]
Topic 7:
[('chemical', 558.3331904742313), ('squad', 505.1703231163276), ('osha', 249.5392135029132), ('blm', 244.49268145099384), ('lung', 219.92513406195502), ('meth', 204.94165484988608), ('textile', 166.1600359474555), ('workers', 153.42865841663283), ('transplant', 144.99722553824074), ('colts', 128.03905568659943)]
Topic 8:
[('united', 3184.257512948764), ('iran', 2472.1807504685353), ('israel', 2343.5138639007173), ('resolution', 2100.562197427291), ('world', 1982.076624883069), ('china', 1980.197844359663), ('nuclear', 1767.9504296230214), ('peace', 1351.986520449407), ('democracy', 1281.7641842135713), ('nations', 1151.6734421721287)]
Topic 9:
[('safety', 4495.243237649813), ('safe', 1049.022886803655), ('trail', 741.0486279816777), ('station', 577.1340844909867), ('disabilities', 505.9179273914216), ('england', 436.77395983760425), ('franks', 334.1226126941158), ('ada', 315.27866348077396), ('recreation', 239.05758045441644), ('theory', 236.85374509258892)]
Topic 10:
[('coast', 2487.9048920124924), ('guard', 1870.3421091245411), ('disaster', 1820.074257168552), ('katrina', 1205.2454518287618), ('hurricane', 1165.3928152083918), ('emergency', 1067.7759317849345), ('flood', 1051.513368515889), ('louisiana', 1041.1264774582708), ('corps', 1034.4833030125974), ('gulf', 1018.8439083428983)]
Topic 11:
[('transportation', 3504.987835313319), ('rail', 1520.56745507114), ('infrastructure', 1395.2534546383638), ('oberstar', 1190.032418641437), ('bridge', 1181.7341141167292), ('highway', 1036.2387251443618), ('public', 833.1306988449502), ('transit', 797.497031361822), ('roads', 676.6193978144953), ('amtrak', 672.2769047461688)]
Topic 12:
[('program', 12527.792318816564), ('funding', 7601.965789610266), ('programs', 7197.2147959960475), ('national', 7001.444721181003), ('provide', 5803.47361109179), ('federal', 5380.776139860975), ('million', 5300.640284095675), ('colleagues', 5142.04426889534), ('years', 4702.245206054788), ('urge', 4466.725590202722)]
Topic 13:
[('georgia', 4866.566270239134), ('texas', 4086.051401642588), ('air', 3062.667045516642), ('johnson', 1695.6806361811034), ('force', 1665.6930193572125), ('south', 1034.453768858234), ('southern', 864.4339717797936), ('united', 776.97297639247), ('navy', 720.9041853959961), ('lewis', 631.5660030554877)]
Topic 14:
[('veterans', 9588.044442220116), ('men', 3856.9160714485724), ('women', 3494.275927364191), ('military', 3414.8488235699047), ('families', 3268.2391588226146), ('war', 3214.9017598298055), ('lives', 2917.737929879603), ('service', 2894.24114068789), ('day', 2830.2564426746358), ('nation', 2809.456861912923)]
Topic 15:
[('energy', 29567.94024807562), ('oil', 21081.13439475678), ('gas', 10656.789959228468), ('prices', 6757.945871845981), ('percent', 4935.0125028150405), ('price', 4673.31040254654), ('fuel', 3951.65171255913), ('natural', 3782.5018255993973), ('use', 3726.6429906193925), ('production', 3587.7741468226254)]
Topic 16:
[('wood', 454.1744424397413), ('produces', 374.03331174081137), ('sands', 293.2816410951835), ('lahood', 151.1939022465631), ('noaa', 142.90433778555925), ('lafayette', 142.303550482649), ('organic', 128.13825315777598), ('brian', 125.36499991481237), ('loretta', 122.04696343247842), ('cattle', 120.75635885750233)]
Topic 17:
[('security', 7370.955665451803), ('border', 4450.2317847559325), ('homeland', 3449.551409977423), ('department', 2592.531840544908), ('city', 2253.101052656351), ('illegal', 2201.6634265120515), ('mexico', 2034.463228561999), ('united', 1937.2723490740116), ('immigration', 1914.7209735331205), ('iowa', 1825.105512039368)]
Topic 18:
[('motion', 3055.1161755023522), ('green', 1363.3419502700156), ('wild', 754.8692042923012), ('fda', 497.9907860244099), ('kingston', 454.0284196034026), ('valley', 364.4334325228004), ('horses', 262.0010439136225), ('petri', 200.0921469265163), ('visclosky', 197.03583711726623), ('animals', 192.40364206661135)]
Topic 19:
[('said', 12705.102274692115), ('things', 10212.644276493642), ('come', 10080.663930902147), ('way', 9892.124762285126), ('right', 8723.840183731563), ('good', 8601.01579625902), ('really', 8450.522057418964), ('years', 8285.741428225194), ('look', 8097.157865833423), ('let', 7885.753401902667)]
Topic 20:
[('property', 1674.3509543373896), ('connecticut', 1625.0463900315176), ('murphy', 1354.875914050051), ('amnesty', 856.1354315418512), ('maryland', 814.7116346330107), ('source', 792.4001185287194), ('altmire', 702.3247450180542), ('illegally', 619.7733282716255), ('van', 552.6255972110423), ('site', 535.5285294602817)]
Topic 21:
[('york', 4545.959780451036), ('bishop', 1323.9163175372596), ('utah', 1229.5233962258196), ('kentucky', 1063.677727420043), ('mining', 676.9665488125693), ('mccarthy', 519.7030028149965), ('mcdermott', 308.85471541559093), ('tiahrt', 306.7549962582717), ('dust', 294.15405850279257), ('logic', 280.1054019428094)]
Topic 22:
[('intelligence', 4938.037501379657), ('foreign', 4726.716487647771), ('united', 4150.845450850457), ('world', 3525.143019048818), ('security', 2474.9637297341237), ('terrorist', 1997.0241649168279), ('countries', 1937.7952627311724), ('jersey', 1864.0540814550175), ('america', 1852.8193179685763), ('attack', 1832.5552793482884)]
Topic 23:
[('rural', 2786.797181235094), ('land', 1626.5806732431738), ('community', 1485.2961838365832), ('communities', 1335.3662079753394), ('native', 1316.502063976478), ('san', 1304.3393877967253), ('kansas', 1085.5286678755017), ('counties', 978.4163138353433), ('areas', 921.645710249913), ('indian', 824.9915683472125)]
Topic 24:
[('fisa', 1221.9668573233698), ('surveillance', 1116.5947511936893), ('agriculture', 980.0173695344029), ('product', 723.9793989997103), ('barton', 710.2984370654107), ('nutrition', 647.296726838412), ('dingell', 447.8809879815333), ('commodity', 395.4233044002986), ('animal', 352.868493404401), ('kline', 347.6399915253937)]
Topic 25:
[('westmoreland', 517.5886981581757), ('french', 351.077876059311), ('mahoney', 149.9048708904191), ('universe', 143.60710817366896), ('dam', 122.62718117628683), ('grande', 119.00719323279547), ('parcel', 114.69382371478505), ('scare', 106.74886055705956), ('culberson', 103.936845868432), ('sentencing', 99.45349147999362)]
Topic 26:
[('university', 2812.2223567651918), ('school', 2680.3217696452944), ('service', 2221.0002287455986), ('space', 2129.3236479033526), ('women', 1940.864386332328), ('illinois', 1914.9175411333345), ('team', 1821.2914945089112), ('office', 1682.0512381699612), ('davis', 1509.998314940875), ('community', 1462.9821503721807)]
Topic 27:
[('education', 6141.300518732915), ('students', 4776.48065214607), ('law', 4161.430455216668), ('college', 3857.8262095905675), ('court', 3590.368635397995), ('federal', 3466.6893205255246), ('school', 2897.608299346429), ('schools', 2699.66292534491), ('amendment', 2668.1633370939703), ('information', 2613.511447015368)]
Topic 28:
[('ohio', 3435.6303296478277), ('county', 2970.938044882421), ('jones', 1071.7867444063409), ('judge', 761.2776093466357), ('district', 615.8826102151768), ('boustany', 408.3902405324127), ('yarmuth', 407.3582715014089), ('stephanie', 393.2630960720095), ('arcuri', 360.96023914088744), ('cleveland', 301.13586957867744)]
Topic 29:
[('research', 6419.435549507337), ('small', 4167.712095232725), ('science', 3921.3699214530075), ('business', 2622.1180256161524), ('stem', 2529.8401402811246), ('technology', 2373.5344850165293), ('businesses', 2207.9928203419104), ('cell', 2130.4280005165574), ('cells', 1241.9911747474234), ('scientific', 972.2960176706125)]
Topic 30:
[('trade', 5932.399957290384), ('jobs', 4954.722714954009), ('food', 4357.707376731886), ('agreement', 3590.783656700152), ('farm', 2753.7725930988486), ('economy', 2590.72074242193), ('free', 2281.7109603576837), ('workers', 2113.886041762944), ('america', 2091.9765894015995), ('united', 2016.2521661306996)]
Topic 31:
[('transfer', 614.8939569988037), ('oklahoma', 597.8571647730786), ('agent', 463.03329668841303), ('ramos', 401.54397611930244), ('issa', 400.3340925159739), ('corrections', 374.65730574050247), ('management', 369.84057451481254), ('smuggler', 365.9567725437547), ('compean', 355.10878999022526), ('owned', 296.53822711188906)]
Topic 32:
[('title', 2533.0166432108053), ('minnesota', 1877.9358377173985), ('read', 1676.5893167352529), ('rules', 1551.4285208220529), ('amended', 1023.7021835763495), ('clerk', 970.9748862695516), ('suspend', 927.5619877251007), ('amend', 771.8029870648969), ('pass', 689.7275645493967), ('cbo', 607.8745061210735)]
Topic 33:
[('amendment', 17727.10283440854), ('appropriations', 6373.938895013601), ('gentleman', 6226.389069464795), ('water', 5479.6462497634875), ('related', 3072.7374661382346), ('agencies', 3004.547309034653), ('yield', 2459.0369065999334), ('consideration', 2259.184870082016), ('june', 2103.7362534468166), ('development', 2099.354151719014)]
Topic 34:
[('iraq', 22158.046137193523), ('war', 13981.840836950492), ('troops', 10016.36233900081), ('military', 6356.356653656073), ('iraqi', 4011.8312870796294), ('defense', 3094.082904742171), ('forces', 2848.2552415216596), ('resolution', 2700.9623740073653), ('afghanistan', 2652.479506198072), ('general', 2528.1240367857567)]
Topic 35:
[('majority', 10814.958379109485), ('vote', 10355.464417808234), ('democrats', 7254.890431432896), ('process', 6702.266239333554), ('debate', 6505.564091612537), ('senate', 6401.864806828942), ('republican', 6084.27503507255), ('rules', 6010.654545916111), ('rule', 5812.538551840076), ('republicans', 5318.89572517263)]
Topic 36:
[('resolution', 7150.14057594968), ('national', 3951.587343418001), ('res', 2372.1394313778414), ('day', 1917.5231816403902), ('virginia', 1772.5962664110866), ('week', 1495.676436931729), ('awareness', 1081.2693033024507), ('recognizing', 1065.089690178554), ('supporting', 1033.0112901383313), ('forest', 1008.0908580197269)]
Topic 37:
[('nation', 3812.041062026645), ('history', 3766.638225351563), ('america', 3730.611465404947), ('great', 3706.0696206696384), ('life', 3195.4554460357676), ('rights', 3060.644728127458), ('years', 2921.9735556676387), ('americans', 2595.95266809486), ('king', 2580.6923493984523), ('freedom', 2492.3686017105283)]
Topic 38:
[('law', 3618.322940228155), ('enforcement', 2544.2065014268132), ('justice', 2128.7553169260946), ('crime', 2082.9848267093575), ('victims', 1873.5559297324983), ('violence', 1598.109519572716), ('rights', 1397.2463715461097), ('criminal', 1333.7259923998809), ('crimes', 1261.3691859335854), ('police', 1203.8907452399537)]
Topic 39:
[('internet', 1213.4523847850098), ('sir', 200.07191062033093), ('appalachian', 192.79000003694915), ('shea', 181.47004217545043), ('pornography', 166.34135238976387), ('ron', 144.78958192352243), ('corrine', 133.891534020901), ('arc', 133.09739243651822), ('whaling', 120.51094206069912), ('archives', 97.16155412772288)]
In [4]:
# Render the fitted model with pyLDAvis and write the HTML report to disk.
start_vis(lda=lda, lda_vec=lda_vec, cv=cv)
C:\Users\ho511\Anaconda3\lib\site-packages\pyLDAvis\_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))