#Packages and functions to be used
import os
#For regex
import re
#importing pandas
import pandas as pd
#To create a wordcloud/graphs
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from colorspacious import cspace_converter
import seaborn as sns
#Allows for randomization; you can set a seed to have reproducible results
import random
from PIL import ImageFilter
#Allows for several values for the same dictionary key
import multidict
#To get a count of words (used in the term_frequency)
from collections import Counter
#NLTK Packages
#To process text using nltk (remove stopwords, lemmatize, tokenize...)
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
#Porter stemmer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
#To perform machine learning with Naive Bayes I need to import the following packages
from sklearn.model_selection import train_test_split
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
#Multinomial classifier for Naive Bayes
from sklearn.naive_bayes import MultinomialNB
#SVMs
from sklearn.svm import SVC
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
#Function 1: Creating a function to read in my files. This function will read in all the files in a specific directory.
#inputs: list of all of the file names to read,
#        the path where the files are located
#outputs: a list; each element in the list is the content of one file that was read in
def reading_in_files(list_of_file_names, path):
    empty_list = []
    for file_name in list_of_file_names:
        #using a context manager so each file is closed after it is read
        with open(os.path.join(path, file_name)) as file:
            empty_list.append(file.read())
    return empty_list
#Function 2: Creating a function to split each string in a list on a given delimiter.
#inputs: the list that is to be split,
#        the item that we want to split on
#outputs: a list of split lists that, when put into a df, will create columns based on where each list was split
def list_split(list_to_be_split, item_to_split_on):
    empty_list = []
    for element in list_to_be_split:
        empty_list.append(element.split(item_to_split_on))
    return empty_list
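#A quick illustrative check of list_split on a made-up string (the real reviews
#are split the same way further down):
list_split(["5 stars**A great film"], "**")
#-> [['5 stars', 'A great film']]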
#Function 3: Vader (from NLTK) sentiment intensity score calculator. This function will calculate the polarity scores
#for each file. It will return a list of dictionaries. Each dictionary will contain the following: a compound score,
#positive score, negative score, neutral score, and the text that was scored.
#inputs: a list of the content to be analyzed (this is what is returned by the reading_in_files function)
#Output: a list of dictionaries; each dictionary will contain a compound score, positive score,
#        negative score, neutral score, and the text that was scored
def sentiment_intensity_score(sentiment_files):
    sent_analyze = SIA()
    results = []
    for text in sentiment_files:
        score = sent_analyze.polarity_scores(text)
        score["sentiment_file"] = text
        results.append(score)
    return results
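#Illustrative check of what a single VADER score dict looks like (exact values
#depend on the installed vader_lexicon):
SIA().polarity_scores("I loved this movie")
#-> a dict with 'neg', 'neu', 'pos', and 'compound' keys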
#Function 4:
#What the function does: creates a list of reviews from a df column, joins the reviews together into one string, and
#gets a count for each word in the string
#Input: df and column
#Output: a dictionary with each word and the count of the word
def creating_freq_list_from_df_to_dict(df, column):
    reviews = df[column].tolist()
    review_string = " ".join(reviews)
    review_string = review_string.split()
    review_dict = Counter(review_string)
    return review_dict
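#Illustrative example on a tiny made-up df:
creating_freq_list_from_df_to_dict(pd.DataFrame({"review": ["good film", "good cast"]}), "review")
#-> Counter({'good': 2, 'film': 1, 'cast': 1})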
#Function 5:
#What the function does: creates a word cloud that is in the shape of the mask passed in
#Input: the location where the mask image is saved, the frequency word dictionary, the max # of words to include,
#and the title of the plot
#Output: the plot of the word cloud
def create_word_cloud_with_mask(path_of_mask_image, dictionary,
                                max_num_words, title):
    mask = np.array(Image.open(path_of_mask_image))
    #creating the word cloud
    word_cloud = WordCloud(background_color = "white",
                           max_words = max_num_words,
                           mask = mask, max_font_size = 125,
                           random_state = 1006)
    word_cloud.generate_from_frequencies(dictionary)
    #creating the coloring for the word cloud
    image_colors = ImageColorGenerator(mask)
    plt.figure(figsize = [8, 8])
    plt.imshow(word_cloud.recolor(color_func = image_colors),
               interpolation = "bilinear")
    plt.title(title)
    sns.set_context("poster")
    plt.axis("off")
    return plt
#Function 6:
#What the function does: creates a df with two columns, word and count, for the top words
#Input: the word frequency dictionary and the number of words wanted
#Output: a df with the top number_of_words_wanted words
def word_freq_dict_to_df_top_words(dictionary, number_of_words_wanted):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    df = df[:number_of_words_wanted]
    return df
#Function 7:
#What the function does: creates a bar graph
#Input: the df and title of the graph
#Output: the bar graph
def top_words_bar_plot(df, title):
    with sns.plotting_context("talk"):
        graph = sns.barplot(y = "count", x = "word", data = df,
                            palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.xticks(rotation = 90)
    return plt
#Function 8:
#What the function does: creates a df with two columns: word and count
#Input: the word frequency dictionary
#Output: a df
def word_freq_dict_to_df_all_words(dictionary):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    return df
#Function 9:
#What the function does: prints 2 statements: one with the total number of words and the other with the number
#of unique words
#Input: the frequency count dictionary
#Output: 2 printed statements
def total_words_unique_words(dictionary):
    eda_reviews_all_words = word_freq_dict_to_df_all_words(dictionary)
    print("The total number of words is", sum(eda_reviews_all_words["count"]))
    print("The total number of unique words is", len(dictionary))
#Function 10:
#What the function does: It duplicates the words in each review that are in all caps.
#Input: the review to be analyzed
#Output: a new review where the first words are the duplicated all-caps words and
#then the original review follows
def duplicate_all_cap_words(review):
    capitalized_word = ""
    #\b[A-Z]{3,}\b matches whole words of three or more capital letters
    #(the original [A-Z][A-Z]+\w also matched words ending in lowercase)
    for word in re.findall(r'\b[A-Z]{3,}\b', review):
        capitalized_word = capitalized_word + " " + word
    new_review = capitalized_word + " " + review
    return new_review
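#Quick illustrative check (made-up review): the all-caps words are prepended so
#they get extra weight in the bag of words:
duplicate_all_cap_words("This film was AMAZING and FUN")
#-> " AMAZING FUN This film was AMAZING and FUN"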
#Function 11: Weighted Title
#What the function does: repeats the title four times so that title words carry extra weight in the bag of words
#Input: the title string
#Output: the title repeated four times
def duplicate_title(review):
    new_review = review + " " + review + " " + review + " " + review
    return new_review
#Function 12:
#What the function does: counts the all-caps words in a review
#Input: the review to be analyzed
#Output: the count of all-caps words
def get_count_of_all_cap_words(review):
    #same all-caps pattern as duplicate_all_cap_words, for consistency
    return len(re.findall(r'\b[A-Z]{3,}\b', review))
#Function 13:
#What the function does: counts the words in a review
#Input: the review to be analyzed
#Output: the word count
def get_count_of_all_words(review):
    #[A-Za-z] fixes the original [A-z] class, which also matched the ASCII
    #punctuation characters that sit between Z and a
    return len(re.findall(r'[A-Za-z]+\w', review))
#Function 14:
#What the function does: It removes all words that have fewer than 3 characters.
#Input: the string to have short words removed
#Output: the string with the words of 2 or fewer characters removed
def remove_words_less_than_3_characters(string):
    new_string = ""
    #again using [A-Za-z] rather than the buggy [A-z]; the pattern keeps only words of 3+ characters
    for word in re.findall(r'[A-Za-z][A-Za-z]+\w', string):
        new_string = new_string + " " + word
    return new_string
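#Quick illustrative check: words with fewer than 3 characters are dropped
remove_words_less_than_3_characters("it is a good film")
#-> " good film"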
#Function 15:
#What the function does: Removes stopwords
#Input: a list of stopwords to be removed, the tokenized item that you want to remove stopwords from
#Output: the same item type back with the stopwords removed
def stop_word_removal(stopwords, item_that_you_want_to_remove_stopwords_in):
    removed_stopwords = []
    for word in item_that_you_want_to_remove_stopwords_in:
        if word not in stopwords:
            removed_stopwords.append(word)
    return removed_stopwords
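#Quick illustrative check with a made-up token list:
stop_word_removal(["the", "and"], ["the", "boy", "and", "girl"])
#-> ['boy', 'girl']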
#Function 16:
#What the function does: It takes the tokens from the df and joins them into a string, then replaces the "," with a space
#Input: the df and column to be changed
#Output: the data untokenized
def getting_data_ready_for_freq(df, column):
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
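#Illustrative check: a tokenized cell ["good", "film"] becomes the string "good film"
getting_data_ready_for_freq(pd.DataFrame({"tokens": [["good", "film"]]}), "tokens")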
#Function 17:
#What the function does: Takes the words in a column and uses the SentimentIntensityAnalyzer from nltk to
#get the sentiment score for every word in the column. If the word has a compound
#score greater than or equal to .005 (max is 1) or less than or equal to -.005 (-1 is min),
#or the word is "not", the word is added to the keep_words list; otherwise the word is dropped.
#Input: the tokenized column
#Output: the list of kept words
def pos_neg_words(column):
    sia = SIA()
    keep_words = []
    for word in column:
        compound = sia.polarity_scores(word)['compound']
        if compound >= 0.005 or compound <= -0.005 or word == "not":
            keep_words.append(word)
    return keep_words
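#Illustrative check (assuming "table" is not in the VADER lexicon, so its
#compound score is 0 and it is dropped):
pos_neg_words(["love", "terrible", "table", "not"])
#-> ['love', 'terrible', 'not']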
#Function 18:
#What the function does: It uses the Porter stemmer to stem each word in the column
#Input: the item that you want to be stemmed
#Output: the same item type back with the words stemmed
def stem_fun(item_that_you_want_to_be_stemmed):
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in item_that_you_want_to_be_stemmed]
    return stemmed
#Function 19:
#What the function does: It lemmatizes the data without POS tags, meaning WordNetLemmatizer treats every
#token as a noun and some words will not be reduced to their lemma
#Input: item to be lemmatized (the column)
#Output: the column lemmatized
def lemma_func(item_to_lemmatize):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = []
    for token in item_to_lemmatize:
        word = lemmatizer.lemmatize(token)
        lemmatized_review.append(word)
    return lemmatized_review
#Function 20:
#What the function does: Creates ngrams from a tokenized column in a dataframe
#Input: the column that you want to create ngrams from and the size of the ngram
#Output: a list of ngrams
def creating_ngrams(item_to_be_ngrammed, number_of_ngram):
    #zip over n staggered copies of the token list to generate the ngrams
    ngrams = zip(*[item_to_be_ngrammed[i:] for i in range(number_of_ngram)])
    #Concatenate the tokens into ngrams and return
    return ["_".join(ngram) for ngram in ngrams]
#Function 21:
#What the function does: Creates a bag of words from a column in a df
#Input: df and column to be bagged
#Output: a list with one Counter per row in the df that contains each word as a key and its count as the value
def bag_of_words(df, column_to_be_bagged):
    bagged = []
    #Counter is already imported at the top of the script
    for word_list in df[column_to_be_bagged]:
        bagged.append(Counter(word_list))
    return bagged
#Function 22:
#What the function does: Takes the bag of words and makes it into a giant sparse matrix df, with 0s where NAs are
#Input: bag of words
#Output: Giant df with the words as column names and counts as row entries
def bow_to_df(bag_of_words):
    df = pd.DataFrame.from_records(bag_of_words)
    df = df.fillna(0).astype(int)
    return df
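#Illustrative example: two tiny Counters become a 2-row document-term df
bow_to_df([Counter(["good", "good", "film"]), Counter(["bad", "film"])])
#-> columns good/film/bad with rows [2, 1, 0] and [0, 1, 1]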
#Function 23:
#What the function does: It normalizes the df by getting the sum of each row and then dividing every entry by
#that sum, resulting in the percentage make-up of each word
#Input: dataframe to be normalized
#Output: normalized dataframe (the row sums are left in a "total" column, which is dropped later)
def normalize_df(df):
    names = df.columns
    df["total"] = df.sum(axis = 1)
    for name in names:
        df[name] = df[name]/df["total"]
    return df
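#Quick illustrative check: each row now sums to 1 (ignoring the "total" column)
normalize_df(pd.DataFrame({"good": [2, 0], "bad": [2, 1]}))
#-> good: [0.5, 0.0], bad: [0.5, 1.0], total: [4, 1]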
#Function 24:
#What the function does: Creates a confusion matrix graph
#Input: the confusion matrix, accuracy_label, and type of df
#Output: Confusion matrix graph
def confusion_matrix_graph(cm, accuracy_label, type_of_df):
    plt.figure(figsize = (8, 8))
    g = sns.heatmap(cm, annot = True, fmt = ".3f", linewidths = .5, square = True, cmap = 'Blues_r', cbar = False)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    all_sample_title = type_of_df + ' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    plt.title(all_sample_title, size = 12)
    return g
#First step is importing the documents
#Getting a list of all the file names in my pos file
positive = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos")
positive_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos"
#Getting a list of all the file names in my neg file
negative = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg")
negative_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg"
print(positive[:10])
print(negative[:10])
positive_reviews = reading_in_files(positive, positive_path)
positive_reviews[3]
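#Inspecting the raw text shows the files appear to use "**" and "==" as field
#delimiters (rating**title==review), so I split on those next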
positive_reviews = list_split(positive_reviews, "**")
positive_reviews[3]
positive_df = pd.DataFrame(positive_reviews)
positive_df
positive_df["review"] = positive_df[positive_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
positive_title = positive_df["review"].tolist()
positive_title[3]
positive_title = list_split(positive_title, "==")
positive_df = pd.DataFrame(positive_title)
positive_df.head()
positive_df.columns = ["title", "review"]
positive_df["title"] = positive_df["title"].apply(lambda row: duplicate_title(row))
positive_df
positive_df["review"] = positive_df[positive_df.columns[:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
positive_df.drop("title", axis = 1, inplace = True)
positive_df.head()
negative_reviews = reading_in_files(negative, negative_path)
negative_reviews[40]
negative_reviews = list_split(negative_reviews, "**")
negative_reviews[0]
negative_df = pd.DataFrame(negative_reviews)
negative_df.head()
negative_df.shape
negative_df["review"] = negative_df[negative_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
negative_title = negative_df["review"].tolist()
negative_title = list_split(negative_title, "==")
negative_df = pd.DataFrame(negative_title)
negative_df.head()
negative_df.columns = ["title", "review"]
negative_df["title"] = negative_df["title"].apply(lambda row: duplicate_title(row))
negative_df["review"] = negative_df[negative_df.columns[:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
negative_df.drop("title", axis = 1, inplace = True)
negative_df
positive_df.shape
negative_df.shape
positive_df["new_review"] = positive_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["cap_count"] = positive_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
positive_df.head(10)
positive_df["all_words_count"] = positive_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["percent_cap"] = positive_df["cap_count"]/positive_df["all_words_count"]
positive_df.head(10)
positive_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
positive_df.head()
positive_df.tail(10)
negative_df["new_review"] = negative_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
negative_df["cap_count"] = negative_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
negative_df["all_words_count"] = negative_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
negative_df.head(10)
negative_df["percent_cap"] = negative_df["cap_count"]/negative_df["all_words_count"]
negative_df.head(10)
negative_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.lower()
positive_df.head(10)
negative_df["new_review"] = negative_df["new_review"].str.lower()
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"(n't)", "not")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('m)", " am")
# 'd can mean had or would... I am going to change it to would beause I feel that is most likely the the correct use for reviews
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('d)", " would")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('ll)", " will")
#I am removing 's, I do not feel like it will be useful for the review as it shows plural or possessive
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('s)", "")
#I want to keep the word not
positive_df.head()
negative_df["new_review"] = negative_df["new_review"].str.replace(r"(n't)", "not")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('m)", " am")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('d)", " would")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('ll)", " will")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('s)", "")
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"[^\w^\s]", " ")
positive_df["new_review"] = positive_df["new_review"].str.replace('\n'," ")
positive_df
negative_df["new_review"] = negative_df["new_review"].str.replace(r"[^\w^\s]", " ")
negative_df["new_review"] = negative_df["new_review"].str.replace('\n'," ")
negative_df.head(10)
#Removing any word that contains fewer than 3 characters
positive_df["new_review"] = positive_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
negative_df["new_review"] = negative_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
positive_df.head(10)
negative_df.head(10)
positive_df["review_tokenize"] = positive_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
negative_df["review_tokenize"] = negative_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
positive_df.head()
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to remove a selected list of stopwords from the tokenized review
stopwords = ["the", "and", "was", "that", "this"]
#Removing stopwords in the tokenized review
positive_df["stopwords_removed"] = positive_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df["stopwords_removed"] = negative_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#Going to remove one more set of stopwords
stopwords = ["movie", "for", "they", "with", "have", "film", "all", "you", "are", "just", "there", "one", "what", "has",
"his", "her", "your", "mine", "from", "not", "but", "like", "harry", "will", "good", "time", "will", "really", "story", "who"]
negative_df["stopwords_removed"] = negative_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df["stopwords_removed"] = positive_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df.head()
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#mask from: https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
positive_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_pos, 750, "Positive Review Word Cloud")
#mask from https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_neg, 750, "Negative Review Word Cloud")
#Visualizing the top 20 words
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(stopwords_removed_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#Visualizing the top 20 words
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(stopwords_removed_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(stopwords_removed_dict_pos)
print("***** Negative *****")
total_words_unique_words(stopwords_removed_dict_neg)
#Adding a label to the positive reviews and negative reviews
positive_df["label"] = "pos"
negative_df["label"] = "neg"
#Combining the positive and negative df together
combined_df = pd.concat([positive_df, negative_df])
combined_df.head()
combined_df
#I need to reset the index
combined_df.reset_index(inplace = True)
#Ready to see if I can predict the review sentiment
no_stopwords_df = pd.DataFrame()
no_stopwords_df["review"] = combined_df["stopwords_removed"].copy()
no_stopwords_df["label"] = combined_df["label"]
no_stopwords_df.head()
#Now I am going to create my bag of words for the tokenized reviews
#First, I must remove the labels that I just added...
no_stopwords_label = no_stopwords_df["label"]
no_stopwords_df.drop("label", axis = 1, inplace = True)
no_stopwords_bow = bag_of_words(no_stopwords_df, "review")
no_stopwords_bow[3]
no_stopwords_df = bow_to_df(no_stopwords_bow)
no_stopwords_df.head()
no_stopwords_df = normalize_df(no_stopwords_df)
no_stopwords_df.head()
#Now I need to remove the total column from the df
no_stopwords_df.drop("total", axis = 1, inplace = True)
#Creating a testing and training df for the normalized dfs
no_stopwords_test_train = no_stopwords_df.copy()
test_train_label = combined_df["label"]
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
no_stopwords_train, no_stopwords_test, no_stopwords_train_label, no_stopwords_test_label = train_test_split(no_stopwords_test_train, test_train_label, test_size = .3, random_state = 9)
#Getting a count of positive and negative opinions in the test label
print(Counter(no_stopwords_test_label))
#There are roughly the same number of positive and negative reviews in the test and train set.
#Naive Bayes attempt Multinomial
clf = MultinomialNB()
clf.fit(no_stopwords_train, no_stopwords_train_label)
test_predicted = clf.predict(no_stopwords_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(no_stopwords_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(no_stopwords_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial No Stopwords")
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)