In [2]:
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
## For Stemming
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize  ## word_tokenize is already imported above
import os
#--------------------------------------
## Tokenization
## Breaking text into tokens -  in this case - words
#-----------------------------------------------------
text="This is any sentence of text. It can have punctuation, CAPS!, etc."
tokenized_word=word_tokenize(text)
print(tokenized_word)
## Looking at word frequency
fdist = FreqDist(tokenized_word)
print(fdist.items())
print(fdist.most_common(1))  ## most common, left to right
#print(fdist.most_common(2))  ## two most common
#print(fdist.most_common(3))  ## three most common
#print(fdist.freq("is"))  ## how frequent 
## "is" occurs once in 17 words. 1/17 = .058
fdist.N()  # total number of tokens in the sample
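## Quick check of the arithmetic above: freq(w) is just count/total,
## so fdist.freq("is") should equal fdist["is"] / fdist.N() = 1/17
print(fdist.freq("is"), fdist["is"]/fdist.N())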
# Visualize word frequency
fdist.plot(30,cumulative=False)
plt.show()
['This', 'is', 'any', 'sentence', 'of', 'text', '.', 'It', 'can', 'have', 'punctuation', ',', 'CAPS', '!', ',', 'etc', '.']
dict_items([('This', 1), ('is', 1), ('any', 1), ('sentence', 1), ('of', 1), ('text', 1), ('.', 2), ('It', 1), ('can', 1), ('have', 1), ('punctuation', 1), (',', 2), ('CAPS', 1), ('!', 1), ('etc', 1)])
[('.', 2)]
[Figure: frequency distribution plot of the 30 most common tokens]
In [8]:
#---------------------------
## Stopwords - English
#------------------------------
stop_words=set(stopwords.words("english"))
#print(stop_words)
#---------------------------
## Removing Stopwords
#------------------------------
filtered_text=[]   ## Create a new empty list
for w in tokenized_word:
 #   print(w)
    if w not in stop_words:
        filtered_text.append(w)
#print("Tokenized text:",tokenized_word)
#print("Filterd text:",filtered_text)

#------------------------
# Stemming
#---------------------
ps = PorterStemmer()   ## stemmer object from nltk

stemmed_words=[]  ## make new empty list
for w in filtered_text:
    stemmed_words.append(ps.stem(w))

#print("Filtered:",filtered_text)
#print("Stemmed:",stemmed_words)

##-------------------------------
# Lemmatization reduces words to their base form (the lemma)
#--------------------------------------------------
# Lemmatization is usually more sophisticated than 
# stemming. A stemmer works on an individual word without
# knowledge of the context. For example, the word "better"
# has "good" as its lemma. Stemming will miss this because
# finding it requires a dictionary look-up.
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()  ## the lemmatizer object we will use
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()
word = "flying"
# print("Lemmatized Word:",lem.lemmatize(word,"v"))
# print("Stemmed Word:",stem.stem(word))

## ------------------------
## Part-of-Speech(POS) tagging
## ------------------------------------
# identify the grammatical group
# Whether it is a NOUN, PRONOUN, 
# ADJECTIVE, VERB, ADVERBS
#------------------------------------------
sent = "Three wonderful things in life are hiking, moutains, and COFFEE!"
Mytokens=nltk.word_tokenize(sent)
#print(Mytokens)
MyTAGS = nltk.pos_tag(Mytokens)
#print(MyTAGS)
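## The tags that come back are Penn Treebank tags. As a small hedged
## illustration, the tag prefixes map onto the coarse groups named
## above: NN* = noun, VB* = verb, JJ* = adjective, RB* = adverb
for MyWord, MyTag in MyTAGS:
    if MyTag.startswith(("NN", "VB", "JJ", "RB")):
        print(MyWord, "-->", MyTag)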

#----------------------------------------
# Importing a Corpus
#----------------------------------------
## On my computer, there is a relative path to data
## DATA/Movie_test/neg
## and 
## DATA/Movie_test/pos
## neg and pos are folders that contain .txt
## files. neg are the negative sentiment and pos
## are positive
## We need to bring in this data into ONE
## large matrix (or dataframe) with the words
## as features (variables or columns)
## and an extra column for the label (P or N)
## This assumes import nltk
print("--------------------------------")
#print("CORPUS EXAMPLES")
#from nltk.corpus import PlaintextCorpusReader

#XXXXXXXXXXXXXXXXXXXXX
## https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
#XXXXXXXXXXXXXXXXXXXXX
#---------------
#
import sklearn
from sklearn.feature_extraction.text import CountVectorizer

#######################
## IMPORTANT - the following is an example
## of many of the "attributes" that CountVectorizer
## can use. HOWEVER !! if you do not understand them
## they may create a HUGE problem for you. 
## Do not use them blindly. 
## You can use either:
##  MyVect2=CountVectorizer(input='content')  OR
##  MyVect3=CountVectorizer(input='filename')
## with only one attribute - namely the input type. 
## 
## Below, while I show all the options, I actually 
## only use a few. I have created 4 examples of creating
## a CountVectorizer. They all follow....
#########################################################
--------------------------------
In [13]:
## Example 1
MyVectorizer1=CountVectorizer(
        input='content', ## can be set as 'content', 'file', or 'filename'
        #If set as ‘filename’, the **sequence passed as an argument to fit**
        #is expected to be a list of filenames 
        #https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#examples-using-sklearn-feature-extraction-text-countvectorizer
        encoding='latin-1',
        decode_error='ignore', #{‘strict’, ‘ignore’, ‘replace’}
        strip_accents=None, # {‘ascii’, ‘unicode’, None}
        lowercase=True, 
        preprocessor=None, 
        tokenizer=None, 
        #stop_words='english', #string {‘english’}, list, or None (default)
        stop_words=None,
        token_pattern=r'(?u)\b\w\w+\b', #Regular expression denoting what constitutes a "token" (raw string so \b stays a word boundary, not a backspace)
        ngram_range=(1, 1), 
        analyzer='word', 
        max_df=1.0, # ignore terms with document freq strictly > this threshold 
        min_df=1, 
        max_features=None, 
        vocabulary=None, 
        binary=False, #If True, all non zero counts are set to 1
        #dtype=<class 'numpy.int64'> 
        )
#print(MyVectorizer1)
## Examples 2 and 3 for creating CountVectorizers
MyVect2=CountVectorizer(input='content')
MyVect3=CountVectorizer(input='filename')
In [14]:
MyVect2
Out[14]:
CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)
In [11]:
## Use os.listdir (or glob) to create a LIST of files in your folder
#path_file="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\SmallTextDocs\\Hike"
#import glob
#import os
## Update this to YOUR PATH - the location where you place SmallTextDocs
## SmallTextDocs is a folder that contains two text (.txt) files
## called Dog.txt and Hike.txt
## If you are on a Mac, you won't need the escaped backslashes \\ -
## the path separator is "/" instead (os.path.join handles either OS).
path="/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs"
# path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\SmallTextDocs"
#print("calling os...")
#print(os.listdir(path))
FileNameList=os.listdir(path)
#print(FileNameList)
ListOfCompleteFiles=[]
for name in os.listdir(path):
    print(path+ "/" + name)
    nextfile=path+ "/" + name   ## renamed to avoid shadowing the built-in next()
    ListOfCompleteFiles.append(nextfile)
#print("DONE...")
print("full list...")
print(ListOfCompleteFiles)
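
## The same list can be built in one line with glob (mentioned in the
## comment at the top of this cell); a minimal sketch, matching .txt only:
import glob
GlobFileList = glob.glob(path + "/*.txt")
#print(GlobFileList)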

AllText_AllFiles=[]
for file in ListOfCompleteFiles:
    with open(file) as FILE:  ## the with-block closes the file automatically
        content=FILE.read()   ## note: a file object can only be .read() once
    AllText_AllFiles.append(content)
#print("AllText_AllFiles is....\n")
#print(AllText_AllFiles)  
#print("FIT TRANSFORM-----------------")
## FIT TRANSFORM USING a CountVect  
#print("The ListOfCompleteFiles is ...")
#print(ListOfCompleteFiles)
X3=MyVect3.fit_transform(ListOfCompleteFiles)
#print("The AllText_AllFiles is...")
#print(AllText_AllFiles)
X2=MyVect2.fit_transform(AllText_AllFiles)
#print(type(X2))
#print(type(X3))
#print(X2.get_shape())
ColumnNames2=MyVect2.get_feature_names()
ColumnNames3=MyVect3.get_feature_names()
#print("The col name for 3 ", ColumnNames3)

#import pandas as pd
#https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html
## !!!!!!!!!! The final DTM in Python!! (this took 20 hours :)
CorpusDF_A2=pd.DataFrame(X2.toarray(),columns=ColumnNames2)
#print("The DF for 2 is...", CorpusDF_A2)
CorpusDF_A=pd.DataFrame(X3.toarray(),columns=ColumnNames3)
#print("The DF for 3 is...", CorpusDF_A)
#print("COMPLETE -------------------------------")

###-------------------------------------------------
### Creating Testing and Training Data
### from folders of txt files
### and from a single csv
###------------------------------------------------

### Here again, we will create tiny datasets
### to try and test our methods
### (1) I will create a new folder called NEG
### and another new folder called POS
### Inside of each, I will add 5 text documents
### The text docs can be very small - couple of sentences
### in the NEG will be negative text docs
### in the POS will be positive
### (2) Next, I will create a .csv with
### each row having the features: sentiment, text, date
### We will use these two formats to create two
### separate test/train sets. 
### !!!!!! These are two DIFFERENT examples
###-----------------------------------------------
###
### POS and NEG
###
###
### This is tricky because right now we have separate pos and neg
### but we want the train and test sets to have a balance
### of pos and neg AND we need to label them!
### One option is to read in all POS into DF1
### and add a column with P for pos
### then, read all NEG into DF2
### and add a column with N for neg
### Finally, join and shuffle DF1 and DF2
### From there, pull the test and train datasets.

### Step 1: Read in the POS files corpus into a DF1
## Example 4 for creating a CountVectorizer
print("Building Vecotrizer....")
MyVect4=CountVectorizer(input='filename',
                        analyzer = 'word',
                        stop_words='english',
                        token_pattern='(?u)[a-zA-Z]+'
                        )
path="/Users/danielcaraway/Documents/IST_736_TextMining/POS"
# path="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\POS"

##Create empty list
POSListOfCompleteFiles=[]
for name in os.listdir(path):
#   print(path+ "/" + name)
    nextfile=path+ "/" + name
    POSListOfCompleteFiles.append(nextfile)

#print("POS full list...")
#print(POSListOfCompleteFiles)

#print("FIT with Vectorizer...")
X4=MyVect4.fit_transform(POSListOfCompleteFiles)
#print(type(X4))
#print(X4.get_shape())
POSColumnNames=MyVect4.get_feature_names()
#print("Column names: ", POSColumnNames[0:10])
#print("Building DF....")
#import pandas as pd
POS_CorpusDF_A=pd.DataFrame(X4.toarray(),columns=POSColumnNames)
#print(POS_CorpusDF_A)

## This looks good. Notice that when I built the Vectorizer
## above, I used the pattern [a-zA-Z]+ which means 
## letters ONLY - no numbers.
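
## A tiny hedged check of that pattern (the sentence below is made up):
## digits are stripped, so "2nd" leaves only "nd" and "10" disappears.
## This should print ['better', 'hike', 'nd', 'the', 'times', 'was']
DemoVect=CountVectorizer(token_pattern='(?u)[a-zA-Z]+')
print(DemoVect.fit(["The 2nd hike was 10 times better"]).get_feature_names())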

## Now, we need to add a column for P or N. 
## I will call it PosORNeg and because all of these
## are positive, I will fill it with P
## DataFrame.insert(loc, column, value, allow_duplicates=False)
#Length=POS_CorpusDF_A.shape
#print(Length[0])  ## num of rows
#print(Length[1])  ## num of columns

## Add column
#print("Adding new column....")
POS_CorpusDF_A["PosORNeg"]="P"
#print(POS_CorpusDF_A)
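
## As an aside, the DataFrame.insert signature quoted above could
## place the label as the FIRST column instead of the last;
## a hedged sketch (don't run both, or the column will already exist):
#POS_CorpusDF_A.insert(0, "PosORNeg", "P")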

## OK - now we have a labeled DF
## !!!!!!!!!!!!!!!
## To use this as a label later
## it must be type categorical
## we will get to that...
## !!!!!!!!!!!!!!!!!!
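
## A hedged preview of that conversion using pandas astype
## (we will actually do it later):
#POS_CorpusDF_A["PosORNeg"] = POS_CorpusDF_A["PosORNeg"].astype("category")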

### Now - we will do the above for
## the negative docs....
pathN="/Users/danielcaraway/Documents/IST_736_TextMining/NEG"
# pathN="C:\\Users\\profa\\Documents\\Python Scripts\\TextMining\\DATA\\NEG"
##Create empty list
NEGListOfCompleteFiles=[]
for name in os.listdir(pathN):
#   print(pathN+ "/" + name)
    nextfile=pathN+ "/" + name
    NEGListOfCompleteFiles.append(nextfile)

#print("full list...")
#print(NEGListOfCompleteFiles)
X5=MyVect4.fit_transform(NEGListOfCompleteFiles)
#print(type(X5))
#print(X5.get_shape())
NEGColumnNames=MyVect4.get_feature_names()
#print(NEGColumnNames)

#import pandas as pd
NEG_CorpusDF_A=pd.DataFrame(X5.toarray(),columns=NEGColumnNames)
#print(NEG_CorpusDF_A)
NEG_CorpusDF_A["PosORNeg"]="N"
#print(NEG_CorpusDF_A)

##### GOOD!
## Now it's time to join the two dataframes together
## From there, we will sample from it to get the
## test and train sets...
#######################################

## Create a new large Pos and Neg DF
## https://pandas.pydata.org/pandas-docs/stable/merging.html
result = NEG_CorpusDF_A.append(POS_CorpusDF_A)
#print(result)
## Replace the NaN with 0 because it actually 
## means none in this case
result=result.fillna(0)
#print(result)
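
## The pandas FutureWarning printed in the output below suggests passing
## sort explicitly; an equivalent hedged alternative using concat:
#result = pd.concat([NEG_CorpusDF_A, POS_CorpusDF_A], sort=False)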

## CREATE TEST AND TRAIN
##
## Now that we have a complete dataframe
## with a label (in this case P or N)
## we can create a testing and training set.
## Recall that to train any model and then test
## that model (such as NB, DT, RF, SVM, etc)
## we must have DISJOINT and balanced training
## and testing data
## EACH CASE CAN BE DIFFERENT
## In this case, our "result" dataframe has all the N
## labels first and the P labels after.
## So we CANNOT grab the first X rows as the test set or 
## they will all be N (not balanced!)
##
## SHUFFLE the DATAFRAME
## df = df.sample(frac=1).reset_index(drop=True)
## Here, specifying drop=True prevents .reset_index 
## from creating a column containing the old index entries.
## the frac=1 means "resample" (shuffle) 100% of the data
result=result.sample(frac=1).reset_index(drop=True)
#print(result)
## This worked! You can see that the shape is the same
## and that the label is no longer all N and then all P

## From here, we can create (randomly) the test and train sets
# make results reproducible
import numpy as np
np.random.seed(140) ## or any number - does not matter
# sample without replacement
## I am choosing "6" here to make the training set
## of size 6. This will make the test set of size 4
## This can be any choice depending on YOU and your data
train_ix = np.random.choice(result.index, 6, replace=False)
df_training = result.iloc[train_ix]
df_test = result.drop(train_ix)
#print("Training set...")
#print(df_training)
#print("Testing set...")
#print(df_test)
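## A hedged alternative: sklearn's train_test_split can do the same
## split in one line AND guarantee the P/N balance via stratify
## (the size and seed below just mirror the choices made above):
from sklearn.model_selection import train_test_split
df_training2, df_test2 = train_test_split(result, train_size=6,
                                          stratify=result["PosORNeg"],
                                          random_state=140)
#print(df_training2)
#print(df_test2)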


########################################
###
###  READING IN ONE .csv 
###
#########################################
## In the above, we had two folders
##  - one with Pos text files and
## one with neg text files.
## We read these in as a corpus
## Created dataframes and merged
## the dataframes.
## However, this is not the only option
## for reading in text or data - there are MANY!
##
## Another common option is to have one .csv
## file that contains labels, text, and other 
## attributes.
## You may want to extract the text and labels
## and again build a labeled dataframe as well
## as test and train sets.
##
##------------------
## The following code will explore this:
##################################################
#import pandas as pd
################
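## A minimal hedged sketch of that idea. The file name
## "MyLabeledText.csv" and its column names "label" and "text"
## are made up here for illustration:
#MyCSV=pd.read_csv("MyLabeledText.csv")      ## hypothetical csv: label, text, date
#MyVect5=CountVectorizer(input='content')    ## 'content' because we pass raw text
#X6=MyVect5.fit_transform(MyCSV["text"])
#CSV_CorpusDF=pd.DataFrame(X6.toarray(), columns=MyVect5.get_feature_names())
#CSV_CorpusDF["LABEL"]=MyCSV["label"]        ## re-attach the label column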
/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Dog.txt
/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Hike.txt
full list...
['/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Dog.txt', '/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocs/Hike.txt']
Building Vectorizer....
/usr/local/lib/python3.7/site-packages/pandas/core/frame.py:7123: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.

To retain the current behavior and silence the warning, pass 'sort=True'.

  sort=sort,
In [12]:
df_test
Out[12]:
PosORNeg abberline able ably abo aboard absinthe absolutely academy accent ... writer writihing yakov year years yellow yes yglesias york young
6 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 ... 0.0 0.0 1.0 2 0 0.0 0.0 0.0 0.0 0
7 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 1.0 0.0 0.0 1 1 1.0 1.0 0.0 0.0 0
8 N 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0
9 P 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0 ... 0.0 0.0 0.0 2 0 0.0 0.0 0.0 0.0 0

4 rows × 1893 columns

In [15]:
result
Out[15]:
PosORNeg abberline able ably abo aboard absinthe absolutely academy accent ... writer writihing yakov year years yellow yes yglesias york young
0 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 1 0 0.0 0.0 0.0 0.0 0
1 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 1
2 N 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 1 0 0.0 0.0 0.0 0.0 0
3 P 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 0 0 0.0 0.0 0.0 1.0 0
4 P 2.0 0.0 1.0 0.0 0.0 1.0 0.0 0.0 2 ... 0.0 0.0 0.0 0 0 0.0 0.0 1.0 0.0 0
5 P 0.0 0.0 0.0 0.0 1.0 0.0 0.0 1.0 0 ... 0.0 1.0 0.0 0 1 0.0 0.0 0.0 1.0 1
6 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1 ... 0.0 0.0 1.0 2 0 0.0 0.0 0.0 0.0 0
7 N 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0 ... 1.0 0.0 0.0 1 1 1.0 1.0 0.0 0.0 0
8 N 0.0 0.0 0.0 2.0 0.0 0.0 0.0 0.0 0 ... 0.0 0.0 0.0 0 0 0.0 0.0 0.0 0.0 0
9 P 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0 ... 0.0 0.0 0.0 2 0 0.0 0.0 0.0 0.0 0

10 rows × 1893 columns
