#Packages and functions to be used
import os
#For regex
import re
#importing pandas
import pandas as pd
#To create a wordcloud/graphs
import numpy as np
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib import cm
from colorspacious import cspace_converter
import seaborn as sns
#Allows for randomization; you can set a seed to have reproducible results
import random
from PIL import ImageFilter
#Allows for several values for the same dictionary key
import multidict
#To get a count of words (used in the term_frequency)
from collections import Counter
#NLTK Packages
#To process text using nltk (remove stopwords, lemmatize, tokenize...)
from nltk.corpus import stopwords
import nltk
from nltk.stem import WordNetLemmatizer
#Porter stemmer
from nltk.stem.porter import PorterStemmer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
#To perform machine learning with Naive Bayes I need to import the following packages
from sklearn.model_selection import train_test_split
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
#Multinomial classifier for Naive Bayes
from sklearn.naive_bayes import MultinomialNB
#SVMs
from sklearn.svm import SVC
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
#Function 1: Creating a function to read in my files. This function will read in all the files in a specific directory.
#inputs: list of all of the file names to read,
#        the path where the files are located
#outputs: a list; each element in the list is the content of one file that was read in
def reading_in_files(list_of_file_names, path):
    empty_list = []
    for file_name in list_of_file_names:
        #using a context manager so each file is closed after it is read
        with open(os.path.join(path, file_name)) as file:
            empty_list.append(file.read())
    return empty_list
#Function 2: Creating a function to split each string in a list on a given delimiter.
#inputs: the list that is to be split,
#        the item that we want to split on
#outputs: a list of split lists that, when put into a df, will create columns based on where each list was split
def list_split(list_to_be_split, item_to_split_on):
    empty_list = []
    for element in list_to_be_split:
        empty_list.append(element.split(item_to_split_on))
    return empty_list
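#A quick illustrative check of list_split on a made-up string (the real reviews
#are split the same way further down):
list_split(["5 stars**A great film"], "**")
#-> [['5 stars', 'A great film']]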
#Function 3: Vader (from NLTK) sentiment intensity score calculator. This function will calculate the polarity scores
#for each file. It will return a list of dictionaries. Each dictionary will contain the following: a compound score,
#positive score, negative score, neutral score, and the text that was scored.
#inputs: a list of the content to be analyzed (this is what is returned by the reading_in_files function)
#Output: a list of dictionaries; each dictionary will contain a compound score, positive score,
#        negative score, neutral score, and the text that was scored
def sentiment_intensity_score(sentiment_files):
    sent_analyze = SIA()
    results = []
    for text in sentiment_files:
        score = sent_analyze.polarity_scores(text)
        score["sentiment_file"] = text
        results.append(score)
    return results
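#Illustrative check of what a single VADER score dict looks like (exact values
#depend on the installed vader_lexicon):
SIA().polarity_scores("I loved this movie")
#-> a dict with 'neg', 'neu', 'pos', and 'compound' keys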
#Function 4:
#What the function does: creates a list of reviews from a df column, joins the reviews together into one string, and
#gets a count for each word in the string
#Input: df and column
#Output: a dictionary with each word and the count of the word
def creating_freq_list_from_df_to_dict(df, column):
    reviews = df[column].tolist()
    review_string = " ".join(reviews)
    review_string = review_string.split()
    review_dict = Counter(review_string)
    return review_dict
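#Illustrative example on a tiny made-up df:
creating_freq_list_from_df_to_dict(pd.DataFrame({"review": ["good film", "good cast"]}), "review")
#-> Counter({'good': 2, 'film': 1, 'cast': 1})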
#Function 5:
#What the function does: creates a word cloud that is in the shape of the mask passed in
#Input: the location where the mask image is saved, the frequency word dictionary, the max # of words to include,
#and the title of the plot
#Output: the plot of the word cloud
def create_word_cloud_with_mask(path_of_mask_image, dictionary,
                                max_num_words, title):
    mask = np.array(Image.open(path_of_mask_image))
    #creating the word cloud
    word_cloud = WordCloud(background_color = "white",
                           max_words = max_num_words,
                           mask = mask, max_font_size = 125,
                           random_state = 1006)
    word_cloud.generate_from_frequencies(dictionary)
    #creating the coloring for the word cloud
    image_colors = ImageColorGenerator(mask)
    plt.figure(figsize = [8, 8])
    plt.imshow(word_cloud.recolor(color_func = image_colors),
               interpolation = "bilinear")
    plt.title(title)
    sns.set_context("poster")
    plt.axis("off")
    return plt
#Function 6:
#What the function does: creates a df with two columns, word and count, for the top words
#Input: the word frequency dictionary and the number of words wanted
#Output: a df with the top number_of_words_wanted words
def word_freq_dict_to_df_top_words(dictionary, number_of_words_wanted):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    df = df[:number_of_words_wanted]
    return df
#Function 7:
#What the function does: creates a bar graph
#Input: the df and title of the graph
#Output: the bar graph
def top_words_bar_plot(df, title):
    with sns.plotting_context("talk"):
        graph = sns.barplot(y = "count", x = "word", data = df,
                            palette = "GnBu_d")
        plt.title(title)
        plt.xlabel("Word")
        plt.ylabel("Count")
        plt.xticks(rotation = 90)
    return plt
#Function 8:
#What the function does: creates a df with two columns: word and count
#Input: the word frequency dictionary
#Output: a df
def word_freq_dict_to_df_all_words(dictionary):
    df = pd.DataFrame.from_dict(dictionary, orient = 'index')
    df.columns = ["count"]
    df["word"] = df.index
    df.reset_index(drop = True, inplace = True)
    df.sort_values(by = ["count"], ascending = False, inplace = True)
    return df
#Function 9:
#What the function does: prints 2 statements: one with the total number of words and the other with the number
#of unique words
#Input: the frequency count dictionary
#Output: 2 printed statements
def total_words_unique_words(dictionary):
    eda_reviews_all_words = word_freq_dict_to_df_all_words(dictionary)
    print("The total number of words is", sum(eda_reviews_all_words["count"]))
    print("The total number of unique words is", len(dictionary))
#Function 10:
#What the function does: It duplicates the words in each review that are in all caps.
#Input: the review to be analyzed
#Output: a new review where the first words are the duplicated all-caps words and
#then the original review follows
def duplicate_all_cap_words(review):
    capitalized_word = ""
    #\b[A-Z]{3,}\b matches whole words of three or more capital letters
    #(the original [A-Z][A-Z]+\w also matched words ending in lowercase)
    for word in re.findall(r'\b[A-Z]{3,}\b', review):
        capitalized_word = capitalized_word + " " + word
    new_review = capitalized_word + " " + review
    return new_review
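#Quick illustrative check (made-up review): the all-caps words are prepended so
#they get extra weight in the bag of words:
duplicate_all_cap_words("This film was AMAZING and FUN")
#-> " AMAZING FUN This film was AMAZING and FUN"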
#Function 11: Weighted Title
#What the function does: repeats the title four times so that title words carry extra weight in the bag of words
#Input: the title string
#Output: the title repeated four times
def duplicate_title(review):
    new_review = review + " " + review + " " + review + " " + review
    return new_review
#Function 12:
#What the function does: counts the all-caps words in a review
#Input: the review to be analyzed
#Output: the count of all-caps words
def get_count_of_all_cap_words(review):
    #same all-caps pattern as duplicate_all_cap_words, for consistency
    return len(re.findall(r'\b[A-Z]{3,}\b', review))
#Function 13:
#What the function does: counts the words in a review
#Input: the review to be analyzed
#Output: the word count
def get_count_of_all_words(review):
    #[A-Za-z] fixes the original [A-z] class, which also matched the ASCII
    #punctuation characters that sit between Z and a
    return len(re.findall(r'[A-Za-z]+\w', review))
#Function 14:
#What the function does: It removes all words that have fewer than 3 characters.
#Input: the string to have short words removed
#Output: the string with the words of 2 or fewer characters removed
def remove_words_less_than_3_characters(string):
    new_string = ""
    #again using [A-Za-z] rather than the buggy [A-z]; the pattern keeps only words of 3+ characters
    for word in re.findall(r'[A-Za-z][A-Za-z]+\w', string):
        new_string = new_string + " " + word
    return new_string
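#Quick illustrative check: words with fewer than 3 characters are dropped
remove_words_less_than_3_characters("it is a good film")
#-> " good film"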
#Function 15:
#What the function does: Removes stopwords
#Input: a list of stopwords to be removed, the tokenized item that you want to remove stopwords from
#Output: the same item type back with the stopwords removed
def stop_word_removal(stopwords, item_that_you_want_to_remove_stopwords_in):
    removed_stopwords = []
    for word in item_that_you_want_to_remove_stopwords_in:
        if word not in stopwords:
            removed_stopwords.append(word)
    return removed_stopwords
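#Quick illustrative check with a made-up token list:
stop_word_removal(["the", "and"], ["the", "boy", "and", "girl"])
#-> ['boy', 'girl']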
#Function 16:
#What the function does: It takes the tokens from the df and joins them into a string, then replaces the "," with a space
#Input: the df and column to be changed
#Output: the data untokenized
def getting_data_ready_for_freq(df, column):
    df[column] = df[column].apply(",".join)
    df[column] = df[column].str.replace(",", " ")
    return df[column]
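#Illustrative check: a tokenized cell ["good", "film"] becomes the string "good film"
getting_data_ready_for_freq(pd.DataFrame({"tokens": [["good", "film"]]}), "tokens")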
#Function 17:
#What the function does: Takes the words in a column and uses the SentimentIntensityAnalyzer from nltk to
#get the sentiment score for every word in the column. If the word has a compound
#score greater than or equal to .005 (max is 1) or less than or equal to -.005 (-1 is min),
#or the word is "not", the word is added to the keep_words list; otherwise the word is dropped.
#Input: the tokenized column
#Output: the list of kept words
def pos_neg_words(column):
    sia = SIA()
    keep_words = []
    for word in column:
        compound = sia.polarity_scores(word)['compound']
        if compound >= 0.005 or compound <= -0.005 or word == "not":
            keep_words.append(word)
    return keep_words
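#Illustrative check (assuming "table" is not in the VADER lexicon, so its
#compound score is 0 and it is dropped):
pos_neg_words(["love", "terrible", "table", "not"])
#-> ['love', 'terrible', 'not']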
#Function 18:
#What the function does: It uses the Porter stemmer to stem each word in the column
#Input: the item that you want to be stemmed
#Output: the same item type back with the words stemmed
def stem_fun(item_that_you_want_to_be_stemmed):
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(token) for token in item_that_you_want_to_be_stemmed]
    return stemmed
#Function 19:
#What the function does: It lemmatizes the data without POS tags, meaning WordNetLemmatizer treats every
#token as a noun and some words will not be reduced to their lemma
#Input: item to be lemmatized (the column)
#Output: the column lemmatized
def lemma_func(item_to_lemmatize):
    lemmatizer = WordNetLemmatizer()
    lemmatized_review = []
    for token in item_to_lemmatize:
        word = lemmatizer.lemmatize(token)
        lemmatized_review.append(word)
    return lemmatized_review
#Function 20:
#What the function does: Creates ngrams from a tokenized column in a dataframe
#Input: the column that you want to create ngrams from and the size of the ngram
#Output: a list of ngrams
def creating_ngrams(item_to_be_ngrammed, number_of_ngram):
    #zip over n staggered copies of the token list to generate the ngrams
    ngrams = zip(*[item_to_be_ngrammed[i:] for i in range(number_of_ngram)])
    #Concatenate the tokens into ngrams and return
    return ["_".join(ngram) for ngram in ngrams]
#Function 21:
#What the function does: Creates a bag of words from a column in a df
#Input: df and column to be bagged
#Output: a list with one Counter per row in the df that contains each word as a key and its count as the value
def bag_of_words(df, column_to_be_bagged):
    bagged = []
    #Counter is already imported at the top of the script
    for word_list in df[column_to_be_bagged]:
        bagged.append(Counter(word_list))
    return bagged
#Function 22:
#What the function does: Takes the bag of words and makes it into a giant sparse matrix df, with 0s where NAs are
#Input: bag of words
#Output: Giant df with the words as column names and counts as row entries
def bow_to_df(bag_of_words):
    df = pd.DataFrame.from_records(bag_of_words)
    df = df.fillna(0).astype(int)
    return df
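#Illustrative example: two tiny Counters become a 2-row document-term df
bow_to_df([Counter(["good", "good", "film"]), Counter(["bad", "film"])])
#-> columns good/film/bad with rows [2, 1, 0] and [0, 1, 1]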
#Function 23:
#What the function does: It normalizes the df by getting the sum of each row and then dividing every entry by
#that sum, resulting in the percentage make-up of each word
#Input: dataframe to be normalized
#Output: normalized dataframe (the row sums are left in a "total" column, which is dropped later)
def normalize_df(df):
    names = df.columns
    df["total"] = df.sum(axis = 1)
    for name in names:
        df[name] = df[name]/df["total"]
    return df
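#Quick illustrative check: each row now sums to 1 (ignoring the "total" column)
normalize_df(pd.DataFrame({"good": [2, 0], "bad": [2, 1]}))
#-> good: [0.5, 0.0], bad: [0.5, 1.0], total: [4, 1]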
#Function 24:
#What the function does: Creates a confusion matrix graph
#Input: the confusion matrix, accuracy_label, and type of df
#Output: Confusion matrix graph
def confusion_matrix_graph(cm, accuracy_label, type_of_df):
    plt.figure(figsize = (8, 8))
    g = sns.heatmap(cm, annot = True, fmt = ".3f", linewidths = .5, square = True, cmap = 'Blues_r', cbar = False)
    plt.ylabel('Actual')
    plt.xlabel('Predicted')
    all_sample_title = type_of_df + ' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    plt.title(all_sample_title, size = 12)
    return g
#First step is importing the documents
#Getting a list of all the file names in my pos file
positive = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos")
positive_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\pos"
#Getting a list of all the file names in my neg file
negative = os.listdir("C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg")
negative_path = "C:\\Users\\ho511\\Desktop\\IST_736\\homeworks\\week_3\\harry_potter_corpus\\neg"
print(positive[:10])
print(negative[:10])
positive_reviews = reading_in_files(positive, positive_path)
positive_reviews[3]
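#Inspecting the raw text shows the files appear to use "**" and "==" as field
#delimiters (rating**title==review), so I split on those next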
positive_reviews = list_split(positive_reviews, "**")
positive_reviews[3]
positive_df = pd.DataFrame(positive_reviews)
positive_df
positive_df["review"] = positive_df[positive_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
positive_title = positive_df["review"].tolist()
positive_title[3]
positive_title = list_split(positive_title, "==")
positive_df = pd.DataFrame(positive_title)
positive_df.head()
positive_df.columns = ["title", "review"]
positive_df["title"] = positive_df["title"].apply(lambda row: duplicate_title(row))
positive_df
positive_df["review"] = positive_df[positive_df.columns[:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
positive_df.drop("title", axis = 1, inplace = True)
positive_df.head()
negative_reviews = reading_in_files(negative, negative_path)
negative_reviews[40]
negative_reviews = list_split(negative_reviews, "**")
negative_reviews[0]
negative_df = pd.DataFrame(negative_reviews)
negative_df.head()
negative_df.shape
negative_df["review"] = negative_df[negative_df.columns[1:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
negative_title = negative_df["review"].tolist()
negative_title = list_split(negative_title, "==")
negative_df = pd.DataFrame(negative_title)
negative_df.head()
negative_df.columns = ["title", "review"]
negative_df["title"] = negative_df["title"].apply(lambda row: duplicate_title(row))
negative_df["review"] = negative_df[negative_df.columns[:]].apply(lambda row: " ".join(row.dropna().astype(str)), axis = 1)
negative_df.drop("title", axis = 1, inplace = True)
negative_df
positive_df.shape
negative_df.shape
positive_df["new_review"] = positive_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["cap_count"] = positive_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
positive_df.head(10)
positive_df["all_words_count"] = positive_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
positive_df.head(10)
positive_df["percent_cap"] = positive_df["cap_count"]/positive_df["all_words_count"]
positive_df.head(10)
positive_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
positive_df.head()
positive_df.tail(10)
negative_df["new_review"] = negative_df.apply(lambda row: duplicate_all_cap_words(row["review"]), axis = 1)
negative_df["cap_count"] = negative_df.apply(lambda row: get_count_of_all_cap_words(row["review"]), axis =1)
negative_df["all_words_count"] = negative_df.apply(lambda row: get_count_of_all_words(row["review"]), axis = 1)
negative_df.head(10)
negative_df["percent_cap"] = negative_df["cap_count"]/negative_df["all_words_count"]
negative_df.head(10)
negative_df.drop(["cap_count", "all_words_count"], axis = 1, inplace = True)
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.lower()
positive_df.head(10)
negative_df["new_review"] = negative_df["new_review"].str.lower()
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"(n't)", "not")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('m)", " am")
# 'd can mean had or would... I am going to change it to would beause I feel that is most likely the the correct use for reviews
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('d)", " would")
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('ll)", " will")
#I am removing 's, I do not feel like it will be useful for the review as it shows plural or possessive
positive_df["new_review"] = positive_df["new_review"].str.replace(r"('s)", "")
#I want to keep the word not
positive_df.head()
negative_df["new_review"] = negative_df["new_review"].str.replace(r"(n't)", "not")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('m)", " am")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('d)", " would")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('ll)", " will")
negative_df["new_review"] = negative_df["new_review"].str.replace(r"('s)", "")
negative_df.head(10)
positive_df["new_review"] = positive_df["new_review"].str.replace(r"[^\w^\s]", " ")
positive_df["new_review"] = positive_df["new_review"].str.replace('\n'," ")
positive_df
negative_df["new_review"] = negative_df["new_review"].str.replace(r"[^\w^\s]", " ")
negative_df["new_review"] = negative_df["new_review"].str.replace('\n'," ")
negative_df.head(10)
#Removing any word that contains fewer than 3 characters
positive_df["new_review"] = positive_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
negative_df["new_review"] = negative_df.apply(lambda row: remove_words_less_than_3_characters(row["new_review"]), axis = 1)
positive_df.head(10)
negative_df.head(10)
positive_df["review_tokenize"] = positive_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
negative_df["review_tokenize"] = negative_df.apply(lambda row: nltk.word_tokenize(row["new_review"]), axis = 1)
positive_df.head()
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to remove a selected list of stopwords from the tokenized review
stopwords = ["the", "and", "was", "that", "this"]
#Removing stopwords in the tokenized review
positive_df["stopwords_removed"] = positive_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df["stopwords_removed"] = negative_df["review_tokenize"].apply(lambda row: stop_word_removal(stopwords, row))
negative_df.head()
#Writing the pandas df to a csv file to save what I have done.
positive_df.to_csv(r'positive_hp_reviews.csv')
negative_df.to_csv(r'negative_hp_reviews.csv')
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#Going to remove one more set of stopwords
stopwords = ["movie", "for", "they", "with", "have", "film", "all", "you", "are", "just", "there", "one", "what", "has",
"his", "her", "your", "mine", "from", "not", "but", "like", "harry", "will", "good", "time", "will", "really", "story", "who"]
negative_df["stopwords_removed"] = negative_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df["stopwords_removed"] = positive_df["stopwords_removed"].apply(lambda row: stop_word_removal(stopwords, row))
positive_df.head()
negative_df.head()
#Now, I am going to visualize the stopwords_removed reviews and decide on what the next processing steps are.
#Visualizing the data without the stopwords
pos_viz = pd.DataFrame()
pos_viz["stopwords_removed"] = positive_df["stopwords_removed"].copy()
pos_viz["stopwords_removed"] = getting_data_ready_for_freq(pos_viz, "stopwords_removed")
stopwords_removed_dict_pos = creating_freq_list_from_df_to_dict(pos_viz, "stopwords_removed")
neg_viz = pd.DataFrame()
neg_viz["stopwords_removed"] = negative_df["stopwords_removed"].copy()
neg_viz["stopwords_removed"] = getting_data_ready_for_freq(neg_viz, "stopwords_removed")
stopwords_removed_dict_neg = creating_freq_list_from_df_to_dict(neg_viz, "stopwords_removed")
#mask from: https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
positive_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_pos, 750, "Positive Review Word Cloud")
#mask from https://www.seekpng.com/ipng/u2r5w7e6r5q8e6t4_harry-potter-silhouette-clipart-harry-potter-and-the/
#creating an array of arrays for the mask
negative_word_cloud = create_word_cloud_with_mask("snape.PNG", stopwords_removed_dict_neg, 750, "Negative Review Word Cloud")
#Visualizing the top 20 words
eda_reviews_top_words_pos = word_freq_dict_to_df_top_words(stopwords_removed_dict_pos, 20)
eda_reviews_top_words_pos
positive_bar_plot = top_words_bar_plot(eda_reviews_top_words_pos, "Top 20 Positive Words")
#Visualizing the top 20 words
eda_reviews_top_words_neg = word_freq_dict_to_df_top_words(stopwords_removed_dict_neg, 20)
eda_reviews_top_words_neg
negative_bar_plot = top_words_bar_plot(eda_reviews_top_words_neg, "Top 20 Negative Words")
print("***** Positive *****")
total_words_unique_words(stopwords_removed_dict_pos)
print("***** Negative *****")
total_words_unique_words(stopwords_removed_dict_neg)
#Adding a label to the positive reviews and negative reviews
positive_df["label"] = "pos"
negative_df["label"] = "neg"
#Combining the positive and negative df together
combined_df = pd.concat([positive_df, negative_df])
combined_df.head()
combined_df
#I need to reset the index
combined_df.reset_index(inplace = True)
#Ready to see if I can predict the review sentiment
no_stopwords_df = pd.DataFrame()
no_stopwords_df["review"] = combined_df["stopwords_removed"].copy()
no_stopwords_df["label"] = combined_df["label"]
no_stopwords_df.head()
#Now I am going to create my bag of words for the tokenized reviews
#First, I must remove the labels that I just added...
no_stopwords_label = no_stopwords_df["label"]
no_stopwords_df.drop("label", axis = 1, inplace = True)
no_stopwords_bow = bag_of_words(no_stopwords_df, "review")
no_stopwords_bow[3]
no_stopwords_df = bow_to_df(no_stopwords_bow)
no_stopwords_df.head()
no_stopwords_df = normalize_df(no_stopwords_df)
no_stopwords_df.head()
#Now I need to remove the total column from the df
no_stopwords_df.drop("total", axis = 1, inplace = True)
#Creating a testing and training df for the normalized dfs
no_stopwords_test_train = no_stopwords_df.copy()
test_train_label = combined_df["label"]
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
no_stopwords_train, no_stopwords_test, no_stopwords_train_label, no_stopwords_test_label = train_test_split(no_stopwords_test_train, test_train_label, test_size = .3, random_state = 9)
#Getting a count of positive and negative opinions in the test label
print(Counter(no_stopwords_test_label))
#There are roughly the same number of positive and negative reviews in the test and train set.
#Naive Bayes attempt Multinomial
clf = MultinomialNB()
clf.fit(no_stopwords_train, no_stopwords_train_label)
test_predicted = clf.predict(no_stopwords_test)
#Getting the accuracy for naive bayes
accuracy = accuracy_score(no_stopwords_test_label, test_predicted, normalize = True)
print("The accuracy is", accuracy)
cm = confusion_matrix(no_stopwords_test_label, test_predicted)
# confusion_matrix_graph(cm, accuracy, "NB Multinomial No Stopwords")
tn, fp, fn, tp = cm.ravel()
print(cm)
print("The number of true negatives is: ", tn)
print("The number of false positives is: ", fp)
print("The number of false negatives is: ", fn)
print("The number of true positives is: ", tp)