In [ ]:
 
In [1]:
## =======================================================
## IMPORTING
## =======================================================
import os
def get_data_from_files(path):
    directory = os.listdir(path)
    results = []
    filenames = [file for file in directory]
    for file in directory:
        f=open(path+file)
        results.append(f.read())
        f.close()
    return results, filenames


# neg = get_data_from_files('../NEG_JK/')
# pos = get_data_from_files('../POS_JK/')
In [5]:
import pandas as pd
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
import os
import gensim

from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
In [6]:
import numpy as np
np.random.seed(2018)

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        print([(vectorizer.get_feature_names()[i], topic[i])
                        for i in topic.argsort()[:-top_n - 1:-1]])
In [7]:
data, filenames = get_data_from_files('Dog_Hike/')
data, filenames
Out[7]:
(['Hiking and walking are a bit different. Walking is more often done on roads or streets.\nHiking is more often done on trails or mountains. Different shoes and gear are required in each case.\nHiking shoes can make or break a hike. ',
  'Hiking, or going on long mountain hikes is a great way to relax. Hike with your dog!',
  'Dogs come in many different shapes and sizes. Some people like big dogs. \nSome dogs are smarter, some are better hunters. Some dogs are great with children. ',
  'Dogs are fun. I like dogs. Having many dogs can be fun too.',
  'People who own dogs tend to live longer. Dogs can bring happiness into your life. \nDogs have their own bacterial biome that can help a person to improve their biome. Dogs are also loving, loyal, and great hiking or walking companions.',
  'Hiking is best done with a buddy or group. Hiking alone can be dangerous. When you hike, tell others you plan and schedule. \nHave a method to call or reach out for help. Hiking can expose one to wildlife, weather, and other concerns. \nHike safely. ',
  'Hiking is great exercise. Mountain hiking can involve signficant inclines that will offer an excellent workout for your entire body. \nBring your pets. Dogs love to hike. However, in some areas, there are rules for bringing pets on a hike. Dogs sometimes need to be on a leash. \n',
  'Rescuing dogs is wonderful, fullfilling, and healthy for you and your family. Dogs can add love and companionship.\nNever leave your dog in the car, ever. Dogs need attention. Do not leave your dog home alone or locked in a box. \nThink about how you would like to be treated, and treat your dog the same way. ',
  'Hiking is a lot of fun. To hike, one needs hiking gear. Some hiking gear includes winter gear. Winter hikes can be very fun, but must be safe. \nThink about temperatures and other factors. Never hike alone, especially in winter. Winter can bring strange temperature changes.',
  'Getting a pet, such as a dog, is a big decision. Dogs are like children. They need love, time, attention, and patience.\nDogs like to play and run. They are great exercise conpanions. Dogs need a lot of attention. Paying attention to your dog will offer you and your dog joy.'],
 ['H4.txt',
  'H5.txt',
  'D4.txt',
  'D5.txt',
  'D1.txt',
  'H2.txt',
  'H3.txt',
  'D2.txt',
  'H1.txt',
  'D3.txt'])
In [12]:
MyVectLDA_DH=CountVectorizer()
##path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\SmallTextDocs"
Vect_DH = MyVectLDA_DH.fit_transform(data)

Vect_DH
ColumnNamesLDA_DH=MyVectLDA_DH.get_feature_names()
CorpusDF_DH=pd.DataFrame(Vect_DH.toarray(),columns=ColumnNamesLDA_DH)
print(CorpusDF_DH)


lda_model_DH = LatentDirichletAllocation(n_components=2, max_iter=10, learning_method='online')
#lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online')
LDA_DH_Model = lda_model_DH.fit_transform(Vect_DH)

print("SIZE: ", LDA_DH_Model.shape)  # (NO_DOCUMENTS, NO_TOPICS)

# # Let's see how the first document in the corpus looks like in
# ## different topic spaces
print("First Doc in Dog and Hike data...")
print(LDA_DH_Model[0])
print("Seventh Doc in DOg Hike...")
print(LDA_DH_Model[6])

# ## Print LDA using print function from above
print("LDA Dog and Hike Model:")
print_topics(lda_model_DH, MyVectLDA_DH)


# ####################################################
# ##
# ## VISUALIZATION
# ##
# ####################################################
import pyLDAvis.sklearn as LDAvis
import pyLDAvis
## conda install -c conda-forge pyldavis
# pyLDAvis.enable_notebook() ## not using notebook
panel = LDAvis.prepare(lda_model_DH, Vect_DH, MyVectLDA_DH, mds='tsne')
# pyLDAvis.show(panel)

pyLDAvis.save_html(panel, 'lda.html')


# p = pyLDAvis.gensim.prepare(topic_model, corpus, dictionary)
# pyLDAvis.save_html(p, 'lda.html')
   about  add  alone  also  an  and  are  areas  as  attention  ...  who  \
0      0    0      0     0   0    2    2      0   0          0  ...    0   
1      0    0      0     0   0    0    0      0   0          0  ...    0   
2      0    0      0     0   0    1    3      0   0          0  ...    0   
3      0    0      0     0   0    0    1      0   0          0  ...    0   
4      0    0      0     1   0    1    1      0   0          0  ...    1   
5      0    0      1     0   0    2    0      0   0          0  ...    0   
6      0    0      0     0   1    0    1      1   0          0  ...    0   
7      1    1      1     0   0    4    0      0   0          1  ...    0   
8      1    0      1     0   0    1    0      0   0          0  ...    0   
9      0    0      0     0   0    3    2      0   1          3  ...    0   

   wildlife  will  winter  with  wonderful  workout  would  you  your  
0         0     0       0     0          0        0      0    0     0  
1         0     0       0     1          0        0      0    0     1  
2         0     0       0     1          0        0      0    0     0  
3         0     0       0     0          0        0      0    0     0  
4         0     0       0     0          0        0      0    0     1  
5         1     0       0     1          0        0      0    2     0  
6         0     1       0     0          0        1      0    0     2  
7         0     0       0     0          1        0      1    2     4  
8         0     0       4     0          0        0      0    0     0  
9         0     1       0     0          0        0      0    1     2  

[10 rows x 172 columns]
SIZE:  (10, 2)
First Doc in Dog and Hike data...
[0.98561631 0.01438369]
Seventh Doc in DOg Hike...
[0.98345889 0.01654111]
LDA Dog and Hike Model:
Topic 0:
[('hiking', 10.590256382602332), ('can', 7.746689527904238), ('dogs', 7.693998169798268), ('hike', 6.590403726137431), ('to', 6.454235972557567), ('or', 5.583817941537805), ('and', 5.447533807920999), ('is', 4.848421587766114), ('be', 4.610463402272269), ('are', 4.571465030329856)]
Topic 1:
[('dogs', 8.750217928814761), ('and', 7.03352912235585), ('dog', 6.144479715292222), ('your', 6.114232842028992), ('are', 4.6684629104081905), ('like', 3.9336150311860987), ('some', 3.812303361180029), ('attention', 3.745051556702626), ('to', 3.5689118508987288), ('you', 3.026887167828192)]
/Users/danielcaraway/anaconda3/lib/python3.7/site-packages/pyLDAvis/_prepare.py:257: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  return pd.concat([default_term_info] + list(topic_dfs))