#Packages and functions to be used
import os
#For regex
import re
#importing pandas
import pandas as pd
#To create a wordcloud/graphs
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from colorspacious import cspace_converter
import seaborn as sns
#Allows for randomization, you can set a seed to have reproducible results
import random
from PIL import ImageFilter
#Allows for several values for the same dictionary key
import multidict
#To get a count of words (used in the term_frequency)
from collections import Counter
#NLTK Packages
#To process text using nltk (remove stopwords, lemmatize, tokenize...)
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
#Porter stemmer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
#To perform machine learning with Naive Bayes, I need to import the following packages
from sklearn.model_selection import train_test_split
#To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
#Multinomial classifier for naive bayes
from sklearn.naive_bayes import MultinomialNB
#SVMs
from sklearn.svm import SVC
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
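#Note: several of the NLTK tools above depend on downloadable resources. A hedged first-run
#sketch (uncomment whichever resources are not already installed):
#nltk.download('vader_lexicon')   #needed by SentimentIntensityAnalyzer
#nltk.download('punkt')           #needed by nltk.word_tokenize
#nltk.download('wordnet')         #needed by WordNetLemmatizer
#nltk.download('stopwords')       #needed by nltk.corpus.stopwords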
#Function 1: Reads in all of the files in a specific directory.
#inputs: a list of the file names to read,
#        the path where the files are located
#outputs: a list where each element is the content of one file that was read in
def reading_in_files(list_of_file_names, path):
    empty_list = []
    for file_name in list_of_file_names:
        file = open(os.path.join(path, file_name))
        data = file.read()
        empty_list.append(data)
        file.close()
    return(empty_list)
#Function 2: Splits each element of a list on a given delimiter.
#inputs: the list that is to be split,
#        the item to split on
#outputs: a list of split lists; when put into a df, columns are created based on where each list was split
def list_split(list_to_be_split, item_to_split_on):
    empty_list = []
    for element in list_to_be_split:
        empty_list.append(element.split(item_to_split_on))
    return(empty_list)
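#A minimal sketch (made-up review strings): the raw review files later in this script separate
#the rating from the review text with "**", which is why that delimiter is split on.
list_split(["8**A wonderful film", "3**A dull slog"], "**")
#returns [['8', 'A wonderful film'], ['3', 'A dull slog']]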
#Function 3: VADER (from NLTK) sentiment intensity score calculator. This function calculates the polarity
#scores for each file and returns a list of dictionaries. Each dictionary contains a compound score,
#positive score, negative score, neutral score, and the text that was scored.
#inputs: a list of the content to be analyzed (what is returned by the reading_in_files function)
#outputs: a list of dictionaries as described above
def sentiment_intensity_score(sentiment_files):
    sent_analyze = SIA()
    results = []
    for text in sentiment_files:
        score = sent_analyze.polarity_scores(text)
        score["sentiment_file"] = text
        results.append(score)
    return(results)
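#A quick sketch on a made-up sentence to show the output shape: VADER returns the keys 'neg',
#'neu', 'pos', and 'compound', and this function adds the scored text under 'sentiment_file'.
sentiment_intensity_score(["What a wonderful movie"])
#returns one dict per input, e.g. [{'neg': 0.0, 'neu': ..., 'pos': ..., 'compound': ..., 'sentiment_file': 'What a wonderful movie'}]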
#Function 4:
#What the function does: pulls the reviews out of a df column, joins them into one string, and
#gets a count for each word in that string
#Input: df and column
#Output: a dictionary with each word and its count
def creating_freq_list_from_df_to_dict(df, column):
    reviews = df[column].tolist()
    review_string = " ".join(reviews)
    review_string = review_string.split()
    review_dict = Counter(review_string)
    return review_dict
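#A small sketch with a toy two-row df (hypothetical data) to show the output shape:
toy_reviews_df = pd.DataFrame({"review": ["great movie", "great cast"]})
creating_freq_list_from_df_to_dict(toy_reviews_df, "review")
#returns Counter({'great': 2, 'movie': 1, 'cast': 1})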
#Function 5:
#What the function does: creates a word cloud in the shape of the mask passed in
#Input: the location where the mask image is saved, the word frequency dictionary, the max # of words to include,
#and the title of the plot
#Output: the word cloud plot
def create_word_cloud_with_mask(path_of_mask_image, dictionary,
                                max_num_words, title):
    mask = np.array(Image.open(path_of_mask_image))
    #creating the word cloud
    word_cloud = WordCloud(background_color = "white",
                           max_words = max_num_words,
                           mask = mask, max_font_size = 125,
                           random_state = 1006)
    word_cloud.generate_from_frequencies(dictionary)
    #creating the coloring for the word cloud
    image_colors = ImageColorGenerator(mask)
    plt.figure(figsize = [8, 8])
    plt.imshow(word_cloud.recolor(color_func = image_colors),
               interpolation = "bilinear")
    plt.title(title)
    sns.set_context("poster")
    plt.axis("off")
    return plt
#Function 6:
#What the function does: creates a df with two columns (word and count) for the top N words
#Input: the word frequency dictionary and the number of words wanted
#Output: a df with the top N words
def word_freq_dict_to_df_top_words(dictionary, number_of_words_wanted):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    df = df[:number_of_words_wanted]
    return(df)
#Function 7:
#What the function does: creates a bar graph
#Input: the df and the title of the graph
#Output: the bar graph
def top_words_bar_plot(df, title):
    with sns.plotting_context("talk"):
        graph = sns.barplot(y = "count", x = "word", data = df,
                            palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.xticks(rotation = 90)
    return plt
#Function 8:
#What the function does: creates a df with two columns: word and count
#Input: the word frequency dictionary
#Output: a df
def word_freq_dict_to_df_all_words(dictionary):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    return(df)
#Function 9:
#What the function does: prints 2 statements: one with the total number of words and the other
#with the number of unique words
#Input: the frequency count dictionary
#Output: 2 printed statements
def total_words_unique_words(dictionary):
    eda_reviews_all_words = word_freq_dict_to_df_all_words(dictionary)
    print("The total number of words is", sum(eda_reviews_all_words["count"]))
    print("The total number of unique words is", len(dictionary))
#Function 10:
#What the function does: duplicates the words in each review that are in all caps.
#Input: the review to be analyzed
#Output: a new review that starts with the duplicated all-caps words and is followed by the original review
def duplicate_all_cap_words(review):
    capitalized_word = ""
    #every match from findall comes from the review itself, so no membership check is needed
    for word in re.findall(r'[A-Z][A-Z]+\w', review):
        capitalized_word = capitalized_word + " " + word
    new_review = capitalized_word + " " + review
    return new_review
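#A quick sketch on a made-up review: the all-caps words (3+ characters) get prepended, so they
#effectively count double in the word frequencies downstream.
duplicate_all_cap_words("This movie was GREAT and the ending was FUN")
#returns ' GREAT FUN This movie was GREAT and the ending was FUN'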
#Function 11:
#What the function does: counts the all-caps words (3+ characters) in a review
#Input: the review to be analyzed
#Output: the count of all-caps words
def get_count_of_all_cap_words(review):
    count = 0
    for word in re.findall(r'[A-Z][A-Z]+\w', review):
        count += 1
    return count
#Function 12:
#What the function does: counts all of the words in a review
#Input: the review to be analyzed
#Output: the count of words
def get_count_of_all_words(review):
    count = 0
    #[A-Za-z] replaces the buggy [A-z] range, which also matches [, \, ], ^, _, and `
    for word in re.findall(r'[A-Za-z]+\w', review):
        count += 1
    return count
#Function 13:
#What the function does: removes all words that have fewer than 3 characters.
#Input: the string to have short words removed
#Output: the string with the words of 2 or fewer characters removed
def remove_words_less_than_3_characters(string):
    new_string = ""
    #keep only words of 3+ characters; [A-Za-z] replaces the buggy [A-z] range
    for word in re.findall(r'[A-Za-z][A-Za-z]+\w', string):
        new_string = new_string + " " + word
    return new_string
#Function 14:
#What the function does: removes stopwords
#Input: a list of stopwords to be removed, the tokenized item that you want to remove stopwords from
#Output: the same item type back with the stopwords removed
def stop_word_removal(stopwords, item_that_you_want_to_remove_stopwords_in):
    removed_stopwords = []
    for word in item_that_you_want_to_remove_stopwords_in:
        if word not in stopwords:
            removed_stopwords.append(word)
    return(removed_stopwords)
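#A minimal sketch with a toy stopword list and token list:
stop_word_removal(["the", "was"], ["the", "film", "was", "amazing"])
#returns ['film', 'amazing']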
#Function 15:
#What the function does: joins the tokens in a df column into a comma-separated string, then replaces each "," with a space
#Input: the df and column to be changed
#Output: the data untokenized
def getting_data_ready_for_freq(df, column):
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return(df[column])
#Function 16:
#What the function does: takes the words in a column and uses the SentimentIntensityAnalyzer from nltk to
#get the sentiment score for every word. If a word has a compound sentiment score greater than
#or equal to 0.005 or less than or equal to -0.005 (the scale runs from -1 to 1), or the word
#is "not", it is added to the keep_words list; otherwise the word is dropped.
def pos_neg_words(column):
    sia = SIA()
    keep_words = []
    for word in column:
        #score each word once and keep it if it carries any sentiment at all
        compound = sia.polarity_scores(word)['compound']
        if compound >= 0.005 or compound <= -0.005 or word == "not":
            keep_words.append(word)
    return keep_words
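#A rough sketch with made-up tokens: sentiment-bearing words and "not" survive, while words that
#are absent from the VADER lexicon (compound score of 0.0, like "chair") are dropped.
pos_neg_words(["terrible", "chair", "not", "wonderful"])
#returns ['terrible', 'not', 'wonderful']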
#Function 17:
#What the function does: uses the Porter stemmer to stem each word in the column
#Input: the item that you want to be stemmed
#Output: the same item type back with the words stemmed
def stem_fun(item_that_you_want_to_be_stemmed):
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in item_that_you_want_to_be_stemmed]
    return(stemmed)
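#A quick sketch: the Porter stemmer chops suffixes, so the stems are not always real words.
stem_fun(["movies", "running", "magical"])
#returns something like ['movi', 'run', 'magic']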
#Function 18:
#What the function does: lemmatizes the data without POS tags, so the lemmatizer treats every
#token as a noun and is less accurate than it could be
#Input: item to be lemmatized (the column)
#Output: the column lemmatized
def lemma_func(item_to_lemmatize):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = []
    for token in item_to_lemmatize:
        word = lemmatizer.lemmatize(token)
        lemmatized_review.append(word)
    return lemmatized_review
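#A quick sketch: without POS tags the lemmatizer treats every token as a noun, so verb forms
#like "running" pass through unchanged while plural nouns are reduced.
lemma_func(["movies", "running", "wizards"])
#returns ['movie', 'running', 'wizard']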
#Function 19:
#What the function does: creates ngrams from a tokenized column in a dataframe
#Input: the tokenized item that you want to create ngrams from and the ngram size
#Output: a list of ngrams
def creating_ngrams(item_to_be_ngrammed, number_of_ngram):
    #zip staggers shifted copies of the token list to generate the ngrams
    ngrams = zip(*[item_to_be_ngrammed[i:] for i in range(number_of_ngram)])
    #concatenate the tokens in each ngram and return
    return ["_".join(ngram) for ngram in ngrams]
#Function 20:
#What the function does: creates a bag of words from a column in a df
#Input: df and column to be transformed
#Output: a list of dictionaries, one per row in the df, that map each word to its count
def bag_of_words(df, column_to_be_bagged):
    bag_of_words = []
    #Counter is already imported at the top of the script
    for word_list in df[column_to_be_bagged]:
        bag_of_words.append(Counter(word_list))
    return bag_of_words
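#A small sketch with a toy tokenized column (hypothetical data):
toy_bow_df = pd.DataFrame({"review": [["good", "good", "fun"], ["dull", "slow"]]})
bag_of_words(toy_bow_df, "review")
#returns [Counter({'good': 2, 'fun': 1}), Counter({'dull': 1, 'slow': 1})]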
#Function 21:
#What the function does: takes the bag of words and makes it into a giant sparse matrix df, with 0s where the nas are
#Input: bag of words
#Output: giant df with the words as column names and counts as row entries
def bow_to_df(bag_of_words):
    df = pd.DataFrame.from_records(bag_of_words)
    df = df.fillna(0).astype(int)
    return(df)
#Function 22:
#What the function does: normalizes the df by getting the sum of each row and then dividing every
#entry by that sum, resulting in the percentage make-up of each word
#Input: dataframe to be normalized
#Output: normalized dataframe
def normalize_df(df):
    names = df.columns
    df["total"] = df.sum(axis = 1)
    for name in names:
        df[name] = df[name]/df["total"]
    return(df)
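#A worked sketch on a tiny count df: the first row [3, 1] sums to 4, so it normalizes to
#[0.75, 0.25]; the "total" column this adds is dropped later in the script.
toy_counts = pd.DataFrame({"good": [3, 0], "bad": [1, 2]})
normalize_df(toy_counts)
#   good   bad  total
#0  0.75  0.25      4
#1  0.00  1.00      2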
#Function 23:
#What the function does: creates a confusion matrix graph
#Input: the confusion matrix, the accuracy score, and the type of df
#Output: confusion matrix graph
def confusion_matrix_graph(cm, accuracy_label, type_of_df):
    plt.figure(figsize = (8, 8))
    g = sns.heatmap(cm, annot = True, fmt = ".3f", linewidths = .5, square = True, cmap = 'Blues_r', cbar = False)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    all_sample_title = type_of_df + ' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    plt.title(all_sample_title, size = 12)
    return(g)
#First step is importing the documents
#Getting a list of all the file names in my pos file
positive = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos")
positive_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos"
#Getting a list of all the file names in my neg file
negative = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg")
negative_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg"
print(positive[:10])
print(negative[:10])
positive_reviews = reading_in_files(positive, positive_path)
positive_reviews[3]
positive_reviews = list_split(positive_reviews, "**")
positive_reviews[3]
positive_df = pd.DataFrame(positive_reviews)
positive_df.head()
positive_df["review"] = positive_df[positive_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
positive_df = positive_df[[0, "review"]]
positive_df.head()
positive_df.columns = ["rating", "review"]
positive_df.head()
negative_reviews = reading_in_files(negative, negative_path)
negative_reviews[0]
negative_reviews = list_split(negative_reviews, "**")
negative_reviews[0]
negative_df = pd.DataFrame(negative_reviews)
negative_df.head()
negative_df["review"] = negative_df[negative_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
negative_df.head()
negative_df = negative_df[[0, "review"]]
negative_df.columns = ["rating", "review"]
negative_df.head()
positive_df.shape
negative_df.shape
positive_dict = creating_freq_list_from_df_to_dict(positive_df, "review")
negative_dict = creating_freq_list_from_df_to_dict(negative_df, "review")
#mask from http://clipart-library.com/clip-art/harry-potter-crest-silhouette-25.htm
#creating an array of arrays for the mask
positive_word_cloud = create_word_cloud_with_mask("sorting_hat.PNG", positive_dict, 750, "Positive Review Word Cloud Pre-Cleaning")
#mask from http://clipart-library.com/clip-art/harry-potter-crest-silhouette-25.htm
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("sorting_hat.PNG", negative_dict, 750, "Negative Review Word Cloud Pre-Cleaning")
#Visualizing the top 12 words/characters
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(positive_dict, 12)
eda_reviews_top_words_pos
#Visualizing the top 12 words/characters
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(negative_dict, 12)
eda_reviews_top_words_neg
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 12 Positive Words Prior to Cleaning")
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 12 Negative Words Prior to Cleaning")
print("***** Positive *****")
total_words_unique_words(positive_dict)
print("***** Negative *****")
total_words_unique_words(negative_dict)
positive_df["new_review"] = positive_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["cap_count"] = positive_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
positive_df.head(10)
positive_df["all_words_count"] = positive_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["percent_cap"] = positive_df["cap_count"]/positive_df["all_words_count"]
positive_df.head(10)
positive_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
positive_df.head()
positive_df.tail(10)
negative_df["new_review"] = negative_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
negative_df["cap_count"] = negative_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
negative_df["all_words_count"] = negative_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
negative_df.head(10)
negative_df["percent_cap"] = negative_df["cap_count"]/negative_df["all_words_count"]
negative_df.head(10)
negative_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.lower()
positive_df.head(10)
negative_df["new_review"] = negative_df["new_review"].str.lower()
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"(n't)", "not")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('m)", " am")
# 'd can mean had or would... I am going to change it to would beause I feel that is most likely the the correct use for reviews
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('d)", " would")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('ll)", " will")
#I am removing 's, I do not feel like it will be useful for the review as it shows plural or possessive
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('s)", "")
#I want to keep the word not
positive_df.head()
negative_df["new_review"] = negative_df["new_review"].str.replace(r"(n't)", "not")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('m)", " am")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('d)", " would")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('ll)", " will")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('s)", "")
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"[^\w^\s]", " ")
positive_df["new_review"] = positive_df["new_review"].str.replace('\n'," ")
positive_df
negative_df["new_review"] = negative_df["new_review"].str.replace(r"[^\w^\s]", " ")
negative_df["new_review"] = negative_df["new_review"].str.replace('\n'," ")
negative_df.head(10)
#Removing any word that contains fewer than 3 characters
positive_df["new_review"] = positive_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
negative_df["new_review"] = negative_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
positive_df.head(10)
negative_df.head(10)
pos_reviews = positive_df["new_review"].tolist()
neg_reviews = negative_df["new_review"].tolist()
#Getting the vader sentiment score for each review
pos_sent = sentiment_intensity_score(pos_reviews)
neg_sent = sentiment_intensity_score(neg_reviews)
#Function 24: Ultimately, I am interested in the compound score returned by the function above. This
#function extracts the requested score (with a positive or negative sign in front of it) for each file.
#inputs: the list of dictionaries returned from the sentiment_intensity_score function and the score type to extract
#output: a list with the requested score for each file (formatted as strings)
def extract_sentiment(vader_sentiment_analysis, type_of_sentiment):
    empty_list = []
    for file in vader_sentiment_analysis:
        empty_list.append("{}".format(file[type_of_sentiment]))
    return(empty_list)
pos_compound = extract_sentiment(pos_sent, "compound")
pos_pos = extract_sentiment(pos_sent, "pos")
pos_neg = extract_sentiment(pos_sent, "neg")
pos_neu = extract_sentiment(pos_sent, "neu")
neg_compound = extract_sentiment(neg_sent, "compound")
neg_pos = extract_sentiment(neg_sent, "pos")
neg_neg = extract_sentiment(neg_sent, "neg")
neg_neu = extract_sentiment(neg_sent, "neu")
positive_df["compound_sentiment"] = pos_compound
positive_df["pos_sentiment"] = pos_pos
positive_df["neg_sentiment"] = pos_neg
positive_df["neu_sentiment"] = pos_neu
positive_df.head()
negative_df["compound_sentiment"] = neg_compound
negative_df["pos_sentiment"] = neg_pos
negative_df["neg_sentiment"] = neg_neg
negative_df["neu_sentiment"] = neg_neu
negative_df.head()
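#Because extract_sentiment formats each score with "{}".format, these sentiment columns hold
#strings. A hedged one-liner to cast a column back to numeric if calculations are needed later:
#positive_df["compound_sentiment"] = positive_df["compound_sentiment"].astype(float)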
positive_df["review_tokenize"] = positive_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
negative_df["review_tokenize"] = negative_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
positive_df.head()
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to remove a selected list of stopwords from the tokenized reviews
#(note: this custom list shadows the stopwords module imported from nltk.corpus above)
stopwords = ["the", "and", "was", "that", "this"]
#Removing stopwords in the tokenized review
positive_df["stopwords_removed"] = positive_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df["stopwords_removed"] = negative_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#mask from: http://clipart-library.com/clip-art/308-3086366_harry-potter-silurysd-harry-potter-clip-art-black.htm
#creating an array of arrays for the mask
positive_word_cloud = create_word_cloud_with_mask("harry_potter.PNG", stopwords_removed_dict_pos, 750, "Positive Review Word Cloud Stopwords Removed")
#Visualizing the top 20 words/characters
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(stopwords_removed_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("harry_potter.PNG", stopwords_removed_dict_neg, 750, "Negative Review Word Cloud Stopwords Removed")
#Visualizing the top 20 words/characters
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(stopwords_removed_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(stopwords_removed_dict_pos)
print("***** Negative *****")
total_words_unique_words(stopwords_removed_dict_neg)
#Going to remove one more set of stopwords
stopwords = ["movie", "for", "they", "with", "have", "film", "all", "you", "are", "just", "there", "one", "what", "has",
"his", "her", "your", "mine", "from", "not"]
negative_df["stopwords_removed"] = negative_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df["stopwords_removed"] = positive_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df.head()
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#mask from: https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
positive_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_pos, 750, "Positive Review Word Cloud Additional Stopwords")
#mask from https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_neg, 750, "Negative Review Word Cloud Additional Stopwords")
#Visualizing the top 20 words/characters
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(stopwords_removed_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#Visualizing the top 20 words/characters
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(stopwords_removed_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(stopwords_removed_dict_pos)
print("***** Negative *****")
total_words_unique_words(stopwords_removed_dict_neg)
#Looking at stemming
positive_df["stemmed"] = positive_df["stopwords_removed"].apply(lambda row: stem_fun(row))
positive_df.head()
negative_df["stemmed"] = negative_df["stopwords_removed"].apply(lambda row: stem_fun(row))
negative_df.head()
#Now, I am going to visualize the stemmed reviews and decide on what the next processing steps are.
#Visualizing the data after stemming
pos_viz = pd.DataFrame()
pos_viz["stemmed"] = positive_df["stemmed"].copy()
pos_viz["stemmed"] = getting_data_ready_for_freq(pos_viz, "stemmed")
stemmed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stemmed")
neg_viz = pd.DataFrame()
neg_viz["stemmed"] = negative_df["stemmed"].copy()
neg_viz["stemmed"] = getting_data_ready_for_freq(neg_viz, "stemmed")
stemmed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stemmed")
#creating an array of arrays for the mask from https://www.pinclipart.com/pindetail/TxmRT_flying-wizard-by-inexorabletruth-harry-potter-flying-silhouette/
positive_word_cloud = create_word_cloud_with_mask("harry_broom.PNG", stemmed_dict_pos, 750, "Positive Review Word Cloud Stemmed")
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("harry_broom.PNG", stemmed_dict_neg, 750, "Negative Review Word Cloud Stemmed")
#Visualizing the top 20 words/characters
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(stemmed_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#Visualizing the top 20 words/characters
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(stemmed_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(stopwords_removed_dict_pos)
print("***** Negative *****")
total_words_unique_words(stopwords_removed_dict_neg)
#Reducing by sentiment instead of stemming...
#For my next feature reduction and word removal, I am removing words based on their sentiment scores
positive_df["sentiment_reduced"] = positive_df.apply(lambda row: pos_neg_words(row["stopwords_removed"]), axis = 1)
negative_df["sentiment_reduced"] = negative_df.apply(lambda row: pos_neg_words(row["stopwords_removed"]), axis = 1)
positive_df.head()
negative_df.head()
#Now, I am going to visualize the sentiment_reduced reviews and decide on what the next processing steps are.
#Visualizing the data after the sentiment-based reduction
pos_viz = pd.DataFrame()
pos_viz["sentiment_reduced"] = positive_df["sentiment_reduced"].copy()
pos_viz["sentiment_reduced"] = getting_data_ready_for_freq(pos_viz, "sentiment_reduced")
sentiment_reduced_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "sentiment_reduced")
neg_viz = pd.DataFrame()
neg_viz["sentiment_reduced"] = negative_df["sentiment_reduced"].copy()
neg_viz["sentiment_reduced"] = getting_data_ready_for_freq(neg_viz, "sentiment_reduced")
sentiment_reduced_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "sentiment_reduced")
#creating an array of arrays for the mask from http://getdrawings.com/hogwarts-silhouette-clipart
positive_word_cloud = create_word_cloud_with_mask("dobby.PNG", sentiment_reduced_dict_pos, 750, "Positive Review Word Cloud Sentiment Reduced")
#mask from http://getdrawings.com/hogwarts-silhouette-clipart
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("dobby.PNG", sentiment_reduced_dict_neg, 750, "Negative Review Word Cloud Sentiment Reduced")
#Visualizing the top 20 words/characters
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(sentiment_reduced_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#Visualizing the top 20 words/characters
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(sentiment_reduced_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(sentiment_reduced_dict_pos)
print("***** Negative *****")
total_words_unique_words(sentiment_reduced_dict_neg)
#Exploring Bigrams
positive_df["bigrams"] = positive_df["review_tokenize"].apply(lambda row: creating_ngrams(row, 2))
negative_df["bigrams"] = negative_df["review_tokenize"].apply(lambda row: creating_ngrams(row, 2))
positive_df.head()
negative_df.head()
#Creating bigrams with the stopwords_removed df
positive_df["bigrams_stopwords_removed"] = positive_df["stopwords_removed"].apply(lambda row: creating_ngrams(row, 2))
negative_df["bigrams_stopwords_removed"] = negative_df["stopwords_removed"].apply(lambda row: creating_ngrams(row, 2))
#repeating the process for fourgrams, only for the review_tokenize column
positive_df["fourgrams"] = positive_df["review_tokenize"].apply(lambda row: creating_ngrams(row, 4))
negative_df["fourgrams"] = negative_df["review_tokenize"].apply(lambda row: creating_ngrams(row, 4))
positive_df.head()
negative_df.head()
#Visualizing bigrams
#Now, I am going to visualize the bigrams created from the tokenized reviews
pos_viz = pd.DataFrame()
pos_viz["bigrams"] = positive_df["bigrams"].copy()
pos_viz["bigrams"] = getting_data_ready_for_freq(pos_viz, "bigrams")
bigrams_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "bigrams")
neg_viz = pd.DataFrame()
neg_viz["bigrams"] = negative_df["bigrams"].copy()
neg_viz["bigrams"] = getting_data_ready_for_freq(neg_viz, "bigrams")
bigrams_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "bigrams")
#creating an array of arrays for the mask from http://getdrawings.com/hogwarts-silhouette-clipart
positive_word_cloud = create_word_cloud_with_mask("harry_potter.PNG", bigrams_dict_pos, 750, "Positive Review Word Cloud Bigrams")
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("harry_potter.PNG", bigrams_dict_neg, 750, "Negative Review Word Cloud Bigrams")
#Visualizing the top 20 bigrams
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(bigrams_dict_pos, 20)
eda_reviews_top_words_pos
#Visualizing the top 20 bigrams
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(bigrams_dict_neg, 20)
eda_reviews_top_words_neg
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Bigrams")
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Bigrams")
#Visualizing the bigrams created from the reviews with stopwords removed
pos_viz = pd.DataFrame()
pos_viz["bigrams_stopwords_removed"] = positive_df["bigrams_stopwords_removed"].copy()
pos_viz["bigrams_stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "bigrams_stopwords_removed")
bigrams_stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "bigrams_stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["bigrams_stopwords_removed"] = negative_df["bigrams_stopwords_removed"].copy()
neg_viz["bigrams_stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "bigrams_stopwords_removed")
bigrams_stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "bigrams_stopwords_removed")
positive_word_cloud = create_word_cloud_with_mask("harry_broom.PNG", bigrams_stopwords_removed_dict_pos, 750, "Positive Review Word Cloud Bigrams Stopwords Removed")
negative_word_cloud = create_word_cloud_with_mask("harry_broom.PNG", bigrams_stopwords_removed_dict_neg, 750, "Negative Review Word Cloud Bigrams Stopwords Removed")
#Visualizing the top 20 bigrams
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(bigrams_stopwords_removed_dict_pos, 20)
eda_reviews_top_words_pos
#Visualizing the top 20 bigrams
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(bigrams_stopwords_removed_dict_neg, 20)
eda_reviews_top_words_neg
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Bigrams (Stopwords Removed)")
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Bigrams (Stopwords Removed)")
#Looking at the fourgrams
#Now, I am going to visualize the fourgrams created from the tokenized reviews
pos_viz = pd.DataFrame()
pos_viz["fourgrams"] = positive_df["fourgrams"].copy()
pos_viz["fourgrams"] = getting_data_ready_for_freq(pos_viz, "fourgrams")
fourgrams_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "fourgrams")
neg_viz = pd.DataFrame()
neg_viz["fourgrams"] = negative_df["fourgrams"].copy()
neg_viz["fourgrams"] = getting_data_ready_for_freq(neg_viz, "fourgrams")
fourgrams_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "fourgrams")
positive_word_cloud = create_word_cloud_with_mask("sorting_hat.PNG", fourgrams_dict_pos, 75, "Positive Review Word Cloud Fourgrams")
negative_word_cloud = create_word_cloud_with_mask("sorting_hat.PNG", fourgrams_dict_neg, 75, "Negative Review Word Cloud Fourgrams")
#Visualizing the top 20 fourgrams
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(fourgrams_dict_pos, 20)
eda_reviews_top_words_pos
#Visualizing the top 20 fourgrams
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(fourgrams_dict_neg, 20)
eda_reviews_top_words_neg
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Fourgrams")
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Fourgrams")
#Adding a label to the positive reviews and negative reviews
positive_df["label"] = "pos"
negative_df["label"] = "neg"
#Combining the positive and negative df together
combined_df = pd.concat([positive_df, negative_df])
combined_df.head()
combined_df.tail()
#I need to reset the index
combined_df.reset_index(inplace = True)
#Ready to see if I can predict the review sentiment
no_stopwords_df = pd.DataFrame()
tokenized_df = pd.DataFrame()
stemmed_df = pd.DataFrame()
sentiment_reduced_df = pd.DataFrame()
no_stopwords_df["review"] = combined_df["stopwords_removed"].copy()
tokenized_df["review"] = combined_df["review_tokenize"].copy()
stemmed_df["review"] = combined_df["stemmed"].copy()
sentiment_reduced_df["review"] = combined_df["sentiment_reduced"].copy()
no_stopwords_df["label"] = combined_df["label"]
tokenized_df["label"] = combined_df["label"]
stemmed_df["label"] = combined_df["label"]
sentiment_reduced_df["label"] = combined_df["label"]
no_stopwords_df.head()
tokenized_df.head()
stemmed_df.head()
sentiment_reduced_df.head()
#Now I am going to create my bag of words for the tokenized reviews
#First, I must remove the labels that I just added...
tokenized_label = tokenized_df["label"]
tokenized_df.drop("label", axis = 1, inplace = True)
no_stopwords_label = no_stopwords_df["label"]
no_stopwords_df.drop("label", axis = 1, inplace = True)
stemmed_label = stemmed_df["label"]
stemmed_df.drop("label", axis = 1, inplace = True)
sentiment_label = sentiment_reduced_df["label"]
sentiment_reduced_df.drop("label", axis = 1, inplace = True)
tokenized_bow = bag_of_words(tokenized_df, "review")
no_stopwords_bow = bag_of_words(no_stopwords_df, "review")
stemmed_bow = bag_of_words(stemmed_df, "review")
sentiment_reduced_bow = bag_of_words(sentiment_reduced_df, "review")
tokenized_bow[3]
no_stopwords_bow[3]
stemmed_bow[3]
sentiment_reduced_bow[3]
tokenized_df = bow_to_df(tokenized_bow)
no_stopwords_df = bow_to_df(no_stopwords_bow)
stemmed_df = bow_to_df(stemmed_bow)
sentiment_reduced_df = bow_to_df(sentiment_reduced_bow)
tokenized_df.head()
no_stopwords_df.head()
stemmed_df.head()
sentiment_reduced_df.head()
#Now, I need to normalize the dataframes
tokenized_df = normalize_df(tokenized_df)
tokenized_df.head()
tokenized_df.shape
no_stopwords_df = normalize_df(no_stopwords_df)
no_stopwords_df.head()
stemmed_df = normalize_df(stemmed_df)
stemmed_df.head()
sentiment_reduced_df = normalize_df(sentiment_reduced_df)
sentiment_reduced_df.head()
#Now I need to remove the total column for each df
tokenized_df.drop("total", axis = 1, inplace = True)
no_stopwords_df.drop("total", axis = 1, inplace = True)
stemmed_df.drop("total", axis = 1, inplace = True)
sentiment_reduced_df.drop("total", axis = 1, inplace = True)
#some sentiment-reduced reviews have no words left, so normalizing divides 0 by 0 and produces NaNs;
#fill those with 0 (casting to int here would zero out the normalized fractions, so the values stay as floats)
sentiment_reduced_df = sentiment_reduced_df.fillna(0)
#Creating a testing and training df for the normalized dfs
tokenize_test_train = tokenized_df.copy()
no_stopwords_test_train = no_stopwords_df.copy()
stemmed_test_train = stemmed_df.copy()
sentiment_test_train = sentiment_reduced_df.copy()
test_train_label = combined_df["label"]
test_train_label
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
tokenized_train, tokenized_test, tokenized_train_label, tokenized_test_label = train_test_split(tokenize_test_train, test_train_label, test_size = .3, random_state = 9)
no_stopwords_train, no_stopwords_test, no_stopwords_train_label, no_stopwords_test_label = train_test_split(no_stopwords_test_train, test_train_label, test_size = .3, random_state = 9)
stemmed_train, stemmed_test, stemmed_train_label, stemmed_test_label = train_test_split(stemmed_test_train, test_train_label, test_size = .3, random_state = 9)
sentiment_train, sentiment_test, sentiment_train_label, sentiment_test_label = train_test_split(sentiment_test_train, test_train_label, test_size = .3, random_state = 9)
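#A quick sanity sketch: with test_size = .3, roughly 30% of the rows should land in each test set.
print(tokenized_train.shape, tokenized_test.shape)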
#Getting a count of positive and negative opinions in the test label
print(Counter(tokenized_test_label))
#There are roughly the same number of positive and negative reviews in the test and train set.
#Naive Bayes attempt: Multinomial on the tokenized reviews
clf = MultinomialNB()
clf.fit(tokenized_train, tokenized_train_label)
test_predicted = clf.predict(tokenized_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(tokenized_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(tokenized_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial Tokenized")
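#cm.ravel() unpacks the 2x2 matrix in row-major order. sklearn sorts the class labels
#alphabetically ("neg" before "pos"), so "neg" is treated as the first class and the
#unpacking below is tn, fp, fn, tp.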
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)
#Naive Bayes attempt: Multinomial on the reviews with stopwords removed
clf = MultinomialNB()
clf.fit(no_stopwords_train, no_stopwords_train_label)
test_predicted = clf.predict(no_stopwords_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(no_stopwords_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(no_stopwords_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial No Stopwords")
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)
#Naive Bayes attempt: Multinomial on the stemmed reviews
clf = MultinomialNB()
clf.fit(stemmed_train, stemmed_train_label)
test_predicted = clf.predict(stemmed_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(stemmed_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(stemmed_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial Stemmed")
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)
#Naive Bayes attempt: Multinomial on the sentiment-reduced reviews
clf = MultinomialNB()
clf.fit(sentiment_train, sentiment_train_label)
test_predicted = clf.predict(sentiment_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(sentiment_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(sentiment_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial Sentiment")
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)