import os
import nltk
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.probability import FreqDist
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
# One-time NLTK data downloads may be needed first, e.g.:
# nltk.download('punkt'); nltk.download('stopwords'); nltk.download('wordnet'); nltk.download('averaged_perceptron_tagger')
example_text = "Kendra loves cats. In fact, she has TEN cats. If she didn't have a house, a husband and a graduate degree in data science, she'd be a cat lady!"
tokenized_example = word_tokenize(example_text)
print(tokenized_example)
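Quick side note: we also imported sent_tokenize, which does the same job at the sentence level. A minimal sketch on the same text (assuming the punkt tokenizer data has been downloaded):
# sent_tokenize splits on sentence boundaries instead of word boundaries
sentences = sent_tokenize(example_text)
print(sentences)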
After turning the words into tokens, we can start to treat them as data. For example, we can use a FREQUENCY DISTRIBUTION to count the words:
fdist = FreqDist(tokenized_example)
print(fdist.items())
# We can use the same Frequency Distribution to find the most common word
print(fdist.most_common(1))
# We can use the Frequency Distribution to find the frequency of specific words
print(fdist.freq("cat"))
# And we can use the Frequency Distribution to graph our sentence by word frequency
fdist.N()  # N() is the total number of tokens in the distribution
fdist.plot(30,cumulative=False)
plt.show()
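To make freq() concrete: it's just the raw count divided by N(), the total number of tokens. A quick sanity check on the fdist we just built:
# freq(sample) is defined as count(sample) / N()
print(fdist["cats"])              # raw count of the token "cats"
print(fdist["cats"] / fdist.N())  # should match fdist.freq("cats")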
Good note! "a" is considered a STOPWORD: a word so common it carries almost no meaning on its own, so we usually filter it out.
stop_words = set(stopwords.words("english"))
print(stop_words)
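Side note: stop_words is just a Python set, so extending it with our own words is easy. A quick sketch (the extra words here are made up for illustration):
# Sets support union, so we can bolt on domain-specific stopwords
custom_stop_words = stop_words.union({"ten", "fact"})
print(len(stop_words), "->", len(custom_stop_words))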
filtered_text=[]
for w in tokenized_example:
    # NLTK's stopword list is all lowercase, so compare lowercased tokens
    # (otherwise capitalized words like "If" and "In" slip through)
    if w.lower() not in stop_words:
        filtered_text.append(w)
print("Tokenized text:",tokenized_example)
print("Filterd text:",filtered_text)
fdist_filtered = FreqDist(filtered_text)
fdist_filtered.plot(30,cumulative=False)
plt.show()
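One more thing the plot reveals: punctuation like "," and "." still counts as tokens. A sketch that drops non-alphabetic tokens on top of the stopword filter (str.isalpha is plain Python, nothing NLTK-specific):
# Keep only tokens made entirely of letters
alpha_only = [w for w in filtered_text if w.isalpha()]
fdist_alpha = FreqDist(alpha_only)
print(fdist_alpha.most_common(5))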
Oh! That's because we have "cat" AND "cats", which the computer counts as two different words! Introducing... STEMMING, which chops words down to a common root:
ps = PorterStemmer()
stemmed_words=[]
for w in filtered_text:
    stemmed_words.append(ps.stem(w))
fdist_stemmed = FreqDist(stemmed_words)
fdist_stemmed.plot(30,cumulative=False)
plt.show()
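Heads up: Porter stems aren't always real words, so don't be alarmed by the labels on that plot. A quick sketch on a few words from our sentence:
# Porter stemming strips suffixes mechanically, so expect non-words like "scienc" and "ladi"
for w in ["cats", "loves", "science", "lady"]:
    print(w, "->", ps.stem(w))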
Also, Lemmatization! A side quest! Lemmatization reduces words to their dictionary base form (the lemma), unlike stemming, which just chops off endings:
from nltk.stem.wordnet import WordNetLemmatizer
lem = WordNetLemmatizer()
from nltk.stem.porter import PorterStemmer
stem = PorterStemmer()
word = "carrying"
# 'v' because carrying is a verb
print("Lemmatized Word:",lem.lemmatize(word,"v"))
print("Stemmed Word:",stem.stem(word))
word = "better"
# 'a' because better is an adj
print("Lemmatized Word:",lem.lemmatize(word,"a"))
print("Stemmed Word:",stem.stem(word))
Another side quest!! This is PART-OF-SPEECH (POS) TAGGING, which identifies each word's grammatical group -- noun, pronoun, adjective, verb, adverb, etc.
sent = "Kendra loves cats. In fact, she has TEN cats. If she didn't have a house, a husband and a graduate degree in data science, she'd be a cat lady!"
Mytokens=nltk.word_tokenize(sent)
MyTAGS = nltk.pos_tag(Mytokens)
print(MyTAGS)
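Bonus: the two side quests connect nicely. The pos_tag output can drive the lemmatizer, so we don't have to hand-pick 'v' or 'a' per word. A sketch (the tag-mapping helper below is our own, not an NLTK built-in):
from nltk.corpus import wordnet

# Map Penn Treebank tags (what pos_tag returns) to the tags lemmatize() expects
def to_wordnet_tag(treebank_tag):
    if treebank_tag.startswith("J"):
        return wordnet.ADJ
    if treebank_tag.startswith("V"):
        return wordnet.VERB
    if treebank_tag.startswith("R"):
        return wordnet.ADV
    return wordnet.NOUN  # lemmatize() defaults to noun anyway

lemmas = [lem.lemmatize(word, to_wordnet_tag(tag)) for word, tag in MyTAGS]
print(lemmas)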
from sklearn.feature_extraction.text import CountVectorizer
path="/Users/danielcaraway/Documents/IST_736_TextMining/SmallTextDocsK"
# METHOD A:
# Getting a list of filenames
ListOfCompleteFiles=[]
for name in os.listdir(path):
    # os.path.join handles the separator and avoids shadowing the built-in next()
    ListOfCompleteFiles.append(os.path.join(path, name))
# METHOD B:
# Getting the text from the filenames
AllText_AllFiles=[]
for file in ListOfCompleteFiles:
    # "with" closes each file for us, even if read() fails
    with open(file) as f:
        AllText_AllFiles.append(f.read())
# AllText_AllFiles[0]
# METHOD A (with filenames)
vec_filename = CountVectorizer(input='filename')
transformed_files = vec_filename.fit_transform(ListOfCompleteFiles)
# get_feature_names_out() needs sklearn >= 1.0; on older versions it was get_feature_names()
files_feature_names = vec_filename.get_feature_names_out()
transformed_files_df = pd.DataFrame(transformed_files.toarray(),columns=files_feature_names)
transformed_files_df
# METHOD B (with all text from files)
vec_content = CountVectorizer(input='content')
transformed_text = vec_content.fit_transform(AllText_AllFiles)
content_feature_names = vec_content.get_feature_names_out()
transformed_text_df = pd.DataFrame(transformed_text.toarray(),columns=content_feature_names)
transformed_text_df
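If those two dataframes look identical, that's the point: input='filename' just makes the vectorizer read each file itself, while input='content' expects text we already loaded. Here's the same machinery on a tiny made-up corpus, just to show what the document-term matrix looks like:
# Two toy "documents" -> one row each; one column per vocabulary word
toy_corpus = ["the cat sat", "the cat and the dog"]
toy_vec = CountVectorizer(input='content')
toy_matrix = toy_vec.fit_transform(toy_corpus)
print(pd.DataFrame(toy_matrix.toarray(), columns=toy_vec.get_feature_names_out()))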
vec_removing_sw = CountVectorizer(input='filename',
analyzer = 'word',
stop_words='english',
token_pattern='(?u)[a-zA-Z]+')
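That token_pattern deserves a second look: the default pattern keeps tokens containing digits, while ours keeps runs of letters only. A quick before/after sketch on a made-up string:
demo = ["room 101 has 2 cats and a dog4u"]
print(CountVectorizer().fit(demo).get_feature_names_out())
# the default pattern keeps '101' and 'dog4u'
print(CountVectorizer(token_pattern='(?u)[a-zA-Z]+').fit(demo).get_feature_names_out())
# the letters-only pattern also admits single letters like 'a', since it matches length-1 runs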
# POSITIVE FILES
path="/Users/danielcaraway/Documents/IST_736_TextMining/AI_POS"
pos_files=[]
for name in os.listdir(path):
    pos_files.append(os.path.join(path, name))
pos_transformed = vec_removing_sw.fit_transform(pos_files)
pos_transformed_feature_names = vec_removing_sw.get_feature_names_out()
pos_transformed_df = pd.DataFrame(pos_transformed.toarray(), columns = pos_transformed_feature_names)
pos_transformed_df['PoN'] = 'p'
pos_transformed_df
# NEGATIVE FILES
path="/Users/danielcaraway/Documents/IST_736_TextMining/AI_NEG"
neg_files=[]
for name in os.listdir(path):
    neg_files.append(os.path.join(path, name))
neg_transformed = vec_removing_sw.fit_transform(neg_files)
neg_transformed_feature_names = vec_removing_sw.get_feature_names_out()
neg_transformed_df = pd.DataFrame(neg_transformed.toarray(), columns = neg_transformed_feature_names)
neg_transformed_df['PoN'] = 'n'
neg_transformed_df
# DataFrame.append() was removed in pandas 2.0; pd.concat does the same job
combined = pd.concat([pos_transformed_df, neg_transformed_df], sort=False)
combined
combined = combined.fillna(0)
combined
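Why the fillna(0)? The positive and negative sets were vectorized separately, so each dataframe only has columns for the words its own files contain, and concatenating leaves NaNs in the gaps. An alternative sketch: fit one vectorizer over both file lists so the vocabulary is shared from the start:
# One fit over all files -> one shared vocabulary, no NaNs to fill
all_files = pos_files + neg_files
all_transformed = vec_removing_sw.fit_transform(all_files)
combined_v2 = pd.DataFrame(all_transformed.toarray(),
                           columns=vec_removing_sw.get_feature_names_out())
combined_v2['PoN'] = ['p'] * len(pos_files) + ['n'] * len(neg_files)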
from IPython.display import display, HTML
display(combined)