#Taking what we have learned from analyzing the kaggle dataset, we decided that we needed more information to try to
# predict profit. We are applying what we learned from our Kaggle analysis to our the numbers dataset.
#Ali and Kendra Final Project
#importing pandas, csv,
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
#To create testing and training dfs and labels
from sklearn.model_selection import train_test_split
# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
#for pre-processing to fit all numeric data on the standard scale
from sklearn.preprocessing import StandardScaler
#for applying PCA function on training and testing sets
from sklearn.decomposition import PCA
#logistic regression
from sklearn.linear_model import LogisticRegression
#SVMs
from sklearn.svm import SVC
#For association rule mining
from apyori import apriori
#This will allow us to silence the warnings
import warnings
warnings.simplefilter("ignore")
#For the confusion matrix
import seaborn as sns
#Functions that we are going to use in our file:
#Creating a function that will change a column data type to category
def cat_fun(df, column):
df[column] = df[column].astype("category")
return(df[column])
#Creating a function that will remove anything in our df and replace it with nothing
def remove(df, column, object_to_remove):
df[column] = df[column].str.replace(object_to_remove, "")
return(df[column])
#Creating a function that will discretize our columns based on quartiles
def quartile_discretize(df, column, categories):
df[column] = pd.qcut(df[column], 4, labels = categories)
return(df[column])
#Creating a function that will merge our dfs with a left join
def left_merge_2_conditions(df1, df2, column1, column2):
df = pd.merge(df1, df2, how = "left", on=[column1, column2])
return(df)
#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_count(df, groupby_column, count_column):
new_df = pd.DataFrame(df.groupby(groupby_column)[count_column].count())
new_df.columns = ["count"]
new_df[groupby_column] = new_df.index.get_level_values(0)
new_df.reset_index(drop = True, inplace = True)
return(new_df)
#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_2_count(df, groupby_column1, groupby_column2, count_column):
new_df = pd.DataFrame(df.groupby([groupby_column1, groupby_column2 ])[count_column].count())
new_df.columns = ["count"]
new_df[groupby_column1] = new_df.index.get_level_values(0)
new_df[groupby_column2] = new_df.index.get_level_values(1)
new_df.reset_index(drop = True, inplace = True)
return(new_df)
# Going to use matplotlib for plotting...
# To create a plot we followed the following formula:
# df.plot(x-axis, y-axis, kind = type of plot, color = [(we specified colors to use here)], legend = False (we did not
# want a legend displayed), title = "Title") then we added a ylabel with plt.ylabel("Type label here") and an x label
# with plt.xlabel("type label here"). Finally, we wanted to change the direction of the xtick names from a 90 degree angle
# to no angle with plt.xticks(rotation = rotation angle desired)
def bar_graph_count(df, x_column, y_column, title):
g = df.plot(x_column, y_column, kind = "bar", legend = False, title = title)
g = plt.ylabel(y_column)
g = plt.xlabel(x_column)
return(g)
#This will calculate the exponential moving average of the columns we want
#exponential moving averages give more weight to the most recent data and less weight to older data
def exp_moving_avg(d, column_to_be_meaned):
d["exp_moving_avg"] = d[column_to_be_meaned].ewm(span=40,adjust=False).mean()
exp_moving_avg = list(d["exp_moving_avg"])
#Adding a 0 to the first entry to exp_moving_avg
exp_moving_avg = [0] + exp_moving_avg
#Removing the last entry in the list
exp_moving_avg.pop()
#Creating a column named exp_moving_avg with the results
d["exp_moving_avg"] = exp_moving_avg
return(exp_moving_avg)
#This will calculate the cumulative moving average
def cumulative_moving_avg(d):
d["moving_avg"] = d.expanding(min_periods = 1).mean()
moving_avg = list(d["moving_avg"])
#Adding a 0 to the first entry to moving avg
cumulative_moving_avg = [0] + moving_avg
#Removing the last entry in the list
cumulative_moving_avg.pop()
return(cumulative_moving_avg)
#This will get the list of all of the entries in the column that we are interested in for calculating the averages
def getting_list_of_entries(df, column_interested_in, column_to_be_meaned):
avg_people = pd.DataFrame(df.groupby([column_interested_in, "released"])[column_to_be_meaned].mean())
avg_column_scores = pd.DataFrame()
column_interested = list(df[column_interested_in].unique())
return([avg_people, column_interested])
#This will make a df for our moving averages that we are calculating
def making_df(people_df, column_interested_in, released, person, cumulative_avg, exp_avg):
df_2 = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean": cumulative_avg, "exp_mean": exp_avg})
return(df_2)
#This includes the functions above, and will calculate the exponential and cumulative moving averages for which ever
#column we specify and return a df will the column interested in, released, cumulative_mean, exp_mean
def calculating_moving_avg(df, column_interested_in, column_to_be_meaned, ty):
people_df = pd.DataFrame()
people = getting_list_of_entries(df, column_interested_in, column_to_be_meaned)
cumulative_avg = []
avg_people = people[0]
avg_people
for person in people[1]:
d = avg_people.groupby(column_interested_in).get_group(person)
cumulative_avg = cumulative_moving_avg(d)
exp_avg = exp_moving_avg(d, column_to_be_meaned)
d.reset_index(inplace = True)
released = d["released"]
df = pd.DataFrame({column_interested_in: person, "released": released, ty+"_cumulative_mean_"+column_interested_in : cumulative_avg, ty+"_exp_mean_"+column_interested_in: exp_avg})
people_df = people_df.append(df)
return(people_df)
#Confusion Matrix Graph Function
def confusion_matrix_graph (cm, accuracy_label, type_of_df):
g = plt.figure(figsize=(2,2))
g = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r', cbar = False);
g = plt.ylabel('Actual');
g = plt.xlabel('Predicted');
g = all_sample_title = type_of_df +' Accuracy Score: {0}'.format(round(accuracy_label, 4))
g = plt.title(all_sample_title, size = 12);
return(g)
#reading in the V2_TN_reports.csv that we scraped
movies = pd.read_csv("V2_TN_reports_dates.csv", encoding = "ISO-8859-1")
movies.head()
#We are dropping the first column named Unnamed:0
movies.drop("Unnamed: 0", axis = 1, inplace = True)
movies.shape
#We have 1987 movies and 19 columns in our current df
#We are going to drop any rows if they have nas or missing values for budget
movies.dropna(inplace = True)
len(movies)
#We had 16 movies with missing values...
#Now we are going to drop any movies with 0s in budget
movies = movies[movies["ProductionBudget"] != "$0"]
len(movies)
#We did not have any movies with a 0 budget
#We are going to drop any movies with a DomesticBoxOffice of 0
movies = movies[movies["DomesticBoxOffice"] != "$0"]
len(movies)
#We had 19 movies with missing domestic box office info
#We are going to change column names to something a little more user friendly. First, we will look at the column names
movies.columns
column_names = ["creative_type", "domestic_box_office", "genre", "inflated_adj_dom_box_office", "int_box_office", "max_theaters",
"open_wkend_rev", "open_wkend_theaters", "budget", "production_method", "released", "released_ww", "year",
"year_ww", "source", "distributor", "engagements", "title", "world_wide_box_office"]
movies.columns = column_names
movies.head()
#Looking at the data type for each column in our df
movies.dtypes
# Eventually, we need to change the following to numeric:
# domestic_box_office
# inflated_adj_dom_box_office
# int_box_office
# max_theathers
# open_wkend_rev
# open_wkend_theaters
# budget
# engagements
# world_wide_box_office
# We need to change the following to category:
# creative_type
# genre
# production_method
# source
# distributor
# We need to change the following to date:
# released
# released ww
#Once we are done cleaning the data we are going to change the data types of the above questions.
#If we change them now, when we clean the df and removed rows, the old categories
#remain, and still show as possible categories.
#First we need to replace the $ and ',' in the columns to be changed to numeric
#First, creating a list of columns that we want to change to numeric
numeric_columns = ["domestic_box_office", "inflated_adj_dom_box_office", "int_box_office",
"max_theaters", "open_wkend_rev", "open_wkend_theaters", "budget", "engagements",
"world_wide_box_office"]
#We are using our remove function which takes the following arguments: df, column, item to remove
movies["domestic_box_office"] = remove(movies, "domestic_box_office", "$")
movies["domestic_box_office"] = remove(movies, "domestic_box_office", ",")
movies["inflated_adj_dom_box_office"] = remove(movies, "inflated_adj_dom_box_office", "$")
movies["inflated_adj_dom_box_office"] = remove(movies, "inflated_adj_dom_box_office", ",")
movies["int_box_office"] = remove(movies, "int_box_office", "$")
movies["int_box_office"] = remove(movies, "int_box_office", ",")
movies["max_theaters"] = remove(movies, "max_theaters", ",")
movies["open_wkend_theaters"] = remove(movies, "open_wkend_theaters", ",")
movies["open_wkend_rev"] = remove(movies, "open_wkend_rev", "$")
movies["open_wkend_rev"] = remove(movies, "open_wkend_rev", ",")
movies["budget"] = remove(movies, "budget", "$")
movies["budget"] = remove(movies, "budget", ",")
movies["engagements"] = remove(movies, "engagements", ",")
movies["world_wide_box_office"] = remove(movies, "world_wide_box_office", "$")
movies["world_wide_box_office"] = remove(movies, "world_wide_box_office", ",")
#Changing all of the columns in numeric_columns to numeric
movies[numeric_columns] = movies[numeric_columns].apply(pd.to_numeric)
# We need to change the following to date: released, released ww
movies["released"] = pd.to_datetime(movies["released"])
movies["released_ww"] = pd.to_datetime(movies["released_ww"])
#Separating the month, day and year into their own columns in case we would like to analyze based on month, day or year
movies["month"], movies["day"] = movies["released"].dt.month, movies["released"].dt.day
movies["month_ww"], movies["day_ww"] = movies["released_ww"].dt.month, movies["released_ww"].dt.day
#Checking data types again
movies.dtypes
#Changing the month to an ordered category
cat = list(range(1,13))
#Changing the month data type from int to ordered category
movies["month"] = pd.Categorical(movies["month"], ordered = True, categories = cat)
movies["month_ww"] = pd.Categorical(movies["month_ww"], ordered = True, categories = cat)
#Checking to see if it worked
movies.month.dtype
#Creating columns named domestic_profit, int_profit, ww_profit
#We want to be able to look at the profit for each movie... Therefore we are creating a
#profit column which is gross - budget
movies["dom_profit"] = movies["domestic_box_office"] - movies["budget"]
movies["int_profit"] = movies["int_box_office"] - movies["budget"]
movies["ww_profit"] = movies["world_wide_box_office"] - movies["budget"]
#Looking to see if that helped
movies.head()
#Creating a percent profit column to have a normalized way to compare profits.
#percent_profit = profit/budget*100
movies["dom_percent_profit"] = movies["dom_profit"]/movies["budget"]*100
movies["int_percent_profit"] = movies["int_profit"]/movies["budget"]*100
movies["ww_percent_profit"] = movies["ww_profit"]/movies["budget"]*100
#checking to see that worked
movies.head()
#Writing the clean version of the df to a csv file
#movies.to_csv("clean.csv", index = False)
# #For some reason the functions do not work without rereading in the csv file...
# movies = pd.read_csv("clean.csv", encoding = "ISO-8859-1")
#Aggregating a moving average column and calculating the mean average pp for each creative type;
#by calculating the mean pp for all creative types but for only the movies prior to the
#movie we are calculting the mean for.
dom_ct_ma = calculating_moving_avg(movies, "creative_type", "dom_percent_profit", "dom")
int_ct_ma = calculating_moving_avg(movies, "creative_type", "int_percent_profit", "int")
ww_ct_ma = calculating_moving_avg(movies, "creative_type", "ww_percent_profit", "ww")
# #Genres:
dom_genre_ma = calculating_moving_avg(movies, "genre", "dom_percent_profit", "dom")
int_genre_ma = calculating_moving_avg(movies, "genre", "int_percent_profit", "int")
ww_genre_ma = calculating_moving_avg(movies, "genre", "ww_percent_profit", "ww")
# production_method:
dom_pm_ma = calculating_moving_avg(movies, "production_method", "dom_percent_profit", "dom")
int_pm_ma = calculating_moving_avg(movies, "production_method", "int_percent_profit", "int")
ww_pm_ma = calculating_moving_avg(movies, "production_method", "ww_percent_profit", "ww")
# source
dom_source_ma = calculating_moving_avg(movies, "source", "dom_percent_profit", "dom")
int_source_ma = calculating_moving_avg(movies, "source", "int_percent_profit", "int")
ww_source_ma = calculating_moving_avg(movies, "source", "ww_percent_profit", "ww")
# distributor:
dom_distributor_ma = calculating_moving_avg(movies, "distributor", "dom_percent_profit", "dom")
int_distributor_ma = calculating_moving_avg(movies, "distributor", "int_percent_profit", "int")
ww_distributor_ma = calculating_moving_avg(movies, "distributor", "ww_percent_profit", "ww")
#Month
dom_month_ma = calculating_moving_avg(movies, "month", "dom_percent_profit", "dom")
int_month_ma = calculating_moving_avg(movies, "month", "int_percent_profit", "int")
ww_month_ma = calculating_moving_avg(movies, "month", "ww_percent_profit", "ww")
#We are going to use our left_merge_2_conditions function:
#Inputs: df1, df2, column to merge on 1 and column to merge on 2
movies = left_merge_2_conditions(movies, dom_ct_ma, "creative_type", "released")
movies = left_merge_2_conditions(movies, int_ct_ma, "creative_type", "released")
movies = left_merge_2_conditions(movies, ww_ct_ma, "creative_type", "released")
movies = left_merge_2_conditions(movies, dom_genre_ma, "genre", "released")
movies = left_merge_2_conditions(movies, int_genre_ma, "genre", "released")
movies = left_merge_2_conditions(movies, ww_genre_ma, "genre", "released")
movies = left_merge_2_conditions(movies, dom_pm_ma, "production_method", "released")
movies = left_merge_2_conditions(movies, int_pm_ma, "production_method", "released")
movies = left_merge_2_conditions(movies, ww_pm_ma, "production_method", "released")
movies = left_merge_2_conditions(movies, dom_source_ma, "source", "released")
movies = left_merge_2_conditions(movies, int_source_ma, "source", "released")
movies = left_merge_2_conditions(movies, ww_source_ma, "source", "released")
movies = left_merge_2_conditions(movies, dom_distributor_ma, "distributor", "released")
movies = left_merge_2_conditions(movies, int_distributor_ma, "distributor", "released")
movies = left_merge_2_conditions(movies, ww_distributor_ma, "distributor", "released")
movies = left_merge_2_conditions(movies, dom_month_ma, "month", "released")
movies = left_merge_2_conditions(movies, int_month_ma, "month", "released")
movies = left_merge_2_conditions(movies, ww_month_ma, "month", "released")
movies.head()
movies.shape
movies.columns
#We are removing any rows that have 0s for the newly calculated columns
#Looking to see what happens if we remove all the movies with a 0 for exp_mean_director and exp_mean_star
movies = movies[movies["dom_cumulative_mean_creative_type"] != 0]
movies = movies[movies["dom_cumulative_mean_genre"] != 0]
movies = movies[movies["dom_cumulative_mean_production_method"] != 0]
movies = movies[movies["dom_cumulative_mean_source"] != 0]
movies = movies[movies["dom_cumulative_mean_distributor"] != 0]
movies = movies[movies["dom_cumulative_mean_month"] != 0]
len(movies) #We still have 1859 movies in our df
#Changing creative_type, genre, production_method, source, distributor to category
#We are using our cat_fun which takes the following inputs: df, column to change
movies["creative_type"] = cat_fun(movies, "creative_type")
movies["genre"] = cat_fun(movies, "genre")
movies["production_method"] = cat_fun(movies, "production_method")
movies["source"] = cat_fun(movies, "source")
movies["distributor"] = cat_fun(movies, "distributor")
#What is the breakdown of genre in our df?
#Getting the count of movies for each genre in our df and saving it as a pandas df.
#We are grouping by genre and then getting the count of the genre column in each group by
#we could have used any column to get the count of...
#We are using the groupby_count function that takes the following arguments (df, groupby_column, count_column)
movies_genre = groupby_count(movies, "genre", "genre")
movies_genre
#Using our bar_graph_count function to visualize the movies_genre group
#It takes the following inputs: df, x_column, y_column, title
movies_genre.sort_values(['count'], ascending=[False], inplace = True)
bar_graph_count(movies_genre, "genre", "count", "Visualization of the Number of Movies per Genre")
#Creating a data frame of the movies creative_type count
movies_ct = groupby_count(movies, "creative_type", "creative_type")
movies_ct["creative_type"]
#Sorting the df, so the bar graph will be in descending order
movies_ct.sort_values(['count'], ascending=[False], inplace = True)
bar_graph_count(movies_ct, "creative_type", "count", "Visualization of the Number of Movies per Creative Type")
movies_year = groupby_count(movies, "year", "genre")
movies_year
bar_graph_count(movies_year, "year", "count", "Visualization of the Number of Movies per Year")
movies_month = groupby_count(movies, "month", "genre")
movies_month
bar_graph_count(movies_month, "month", "count", "Visualization of the Number of Movies per Month")
movies_source = groupby_count(movies, "source", "genre")
movies_source
movies_source.sort_values(['count'], ascending=[False], inplace = True)
bar_graph_count(movies_source, "source", "count", "Visualization of the Number of Movies per Source")
movies_distributor = groupby_count(movies, "distributor", "genre")
movies_distributor
movies_distributor = movies_distributor[movies_distributor["count"] > 0]
movies_distributor
movies_distributor.sort_values(['count'], ascending=[False], inplace = True)
bar_graph_count(movies_distributor, "distributor", "count", "Visualization of the Number of Movies per Distributor")
movies_production_method = groupby_count(movies, "production_method", "genre")
movies_production_method
movies_production_method.sort_values(['count'], ascending=[False], inplace = True)
bar_graph_count(movies_production_method, "production_method", "count", "Visualization of the Number of Movies per Production Method")
#Discretizing the df
movies_discretized = movies.copy()
#Getting a list of all of our columns
movies_discretized.columns
# We are going to descritize our data based on the quartiles. The categories are:
# extremely_low, low, high, extremely_high
# We are using our quartile_discretize function that takes the following arguments:
#(df, column, category)
categories = ["extremely_low", "low", "high", "extremely_high"]
movies_discretized["domestic_box_office"] = quartile_discretize(movies_discretized, "domestic_box_office", categories)
movies_discretized["inflated_adj_dom_box_office"] = quartile_discretize(movies_discretized, "inflated_adj_dom_box_office", categories)
movies_discretized["int_box_office"] = quartile_discretize(movies_discretized, "int_box_office", categories)
movies_discretized["max_theaters"] = quartile_discretize(movies_discretized, "max_theaters", categories)
movies_discretized["open_wkend_rev"] = quartile_discretize(movies_discretized, "open_wkend_rev", categories)
movies_discretized["open_wkend_theaters"] = quartile_discretize(movies_discretized, "open_wkend_theaters", categories)
movies_discretized["budget"] = quartile_discretize(movies_discretized, "budget", categories)
movies_discretized["engagements"] = quartile_discretize(movies_discretized, "engagements", categories)
movies_discretized["world_wide_box_office"] = quartile_discretize(movies_discretized, "world_wide_box_office", categories)
movies_discretized.columns
#The other columns that are going to be discretized contain information regarding percent profit... We have decided to
# use the following categories for percent profit
#We are creating new label categories ; Discretized Percent Profit
#We cannot use our function on this, because we are not discretizing by quartiles
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized["dom_percent_profit"] = pd.cut(movies_discretized["dom_percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_percent_profit"] = pd.cut(movies_discretized["int_percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_percent_profit"] = pd.cut(movies_discretized["ww_percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_creative_type"] = pd.cut(movies_discretized["dom_cumulative_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_creative_type"] = pd.cut(movies_discretized["dom_exp_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_creative_type"] = pd.cut(movies_discretized["int_cumulative_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_creative_type"] = pd.cut(movies_discretized["int_exp_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_creative_type"] = pd.cut(movies_discretized["ww_cumulative_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_creative_type"] = pd.cut(movies_discretized["ww_exp_mean_creative_type"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_genre"] = pd.cut(movies_discretized["dom_cumulative_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_genre"] = pd.cut(movies_discretized["dom_exp_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_genre"] = pd.cut(movies_discretized["int_cumulative_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_genre"] = pd.cut(movies_discretized["int_exp_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_genre"] = pd.cut(movies_discretized["ww_cumulative_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_genre"] = pd.cut(movies_discretized["ww_exp_mean_genre"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_production_method"] = pd.cut(movies_discretized["dom_cumulative_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_production_method"] = pd.cut(movies_discretized["dom_exp_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_production_method"] = pd.cut(movies_discretized["int_cumulative_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_production_method"] = pd.cut(movies_discretized["int_exp_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_production_method"] = pd.cut(movies_discretized["ww_cumulative_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_production_method"] = pd.cut(movies_discretized["ww_exp_mean_production_method"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_source"] = pd.cut(movies_discretized["dom_cumulative_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_source"] = pd.cut(movies_discretized["dom_exp_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_source"] = pd.cut(movies_discretized["int_cumulative_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_source"] = pd.cut(movies_discretized["int_exp_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_source"] = pd.cut(movies_discretized["ww_cumulative_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_source"] = pd.cut(movies_discretized["ww_exp_mean_source"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_distributor"] = pd.cut(movies_discretized["dom_cumulative_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_distributor"] = pd.cut(movies_discretized["dom_exp_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_distributor"] = pd.cut(movies_discretized["int_cumulative_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_distributor"] = pd.cut(movies_discretized["int_exp_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_distributor"] = pd.cut(movies_discretized["ww_cumulative_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_distributor"] = pd.cut(movies_discretized["ww_exp_mean_distributor"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_cumulative_mean_month"] = pd.cut(movies_discretized["dom_cumulative_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["dom_exp_mean_month"] = pd.cut(movies_discretized["dom_exp_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_cumulative_mean_month"] = pd.cut(movies_discretized["int_cumulative_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["int_exp_mean_month"] = pd.cut(movies_discretized["int_exp_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_cumulative_mean_month"] = pd.cut(movies_discretized["ww_cumulative_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized["ww_exp_mean_month"] = pd.cut(movies_discretized["ww_exp_mean_month"], [-100, 0, 50, 150, 999999], labels = categories)
movies_discretized.dom_profit.describe()
#negative = -999999999 - 0
#low = 0 - 40000000
# high = 40000000 - 79559420
# extremely_high = 79559420 - 9999999999
movies_discretized["dom_profit"] = pd.cut(movies_discretized["dom_profit"], [-999999999, 0, 40000000, 79559420, 999999999], labels = categories)
movies_discretized.int_profit.describe()
#negative = -999999999 - 0
#low = 0 - 3747336
# high = 3747336 - 49913670
# extremely_high = 49913670 - 9999999999
movies_discretized["int_profit"] = pd.cut(movies_discretized["int_profit"], [-999999999, 0, 3747336, 49913670, 9999999999], labels = categories)
movies_discretized.ww_profit.describe()
#negative = -999999999 - 0
#low = 0 - 10000000
# high = 10000000 - 303138900
# extremely_high = 303138900 - 9999999999
movies_discretized["ww_profit"] = pd.cut(movies_discretized["ww_profit"], [-999999999, 0, 10000000, 303138900, 9999999999], labels = categories)
#We are setting new categories for the day column by creating a new column for week
# week_1 is the first 7 days of the month, week_2 is days 8 - 14, week_3 is days 15 - 21, and week_4 are the
# rest of the days
categories = ["week_1", "week_2", "week_3", "week_4"]
movies_discretized["week"] = pd.cut(movies_discretized["day"], [0, 8, 15, 22, 32], labels = categories)
#Looking at the relationship between genre and percent profit
movies_discretized_genre_pp = groupby_2_count(movies_discretized, "genre", "dom_percent_profit", "genre")
movies_discretized_genre_pp
#Now we are getting the sum of each genre category... We do not have a function for sum... we could go back and rework
#our function.
movies_discretized_genre_pp.groupby("genre")["count"].sum()
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
# We ultimately want a column that contains the total counts for each genre group. We are probably doing this in
# a roundabout way, but as I am extremely new to python this is the best way I can think of doing it. We are going to create
# a new column that replicates the genre column called genre_count and then we will use the replace function to
# replace the genre names with their total count
#First, replicating the income level column in a column named budget_category_count
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
#Now replacing the income level with the total count for each income level
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Action"], 377 )
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Adventure"], 538)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Black Comedy"], 9)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Comedy"], 315)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Concert/Performance"], 2)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Documentary"], 2)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Drama"], 249)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Horror"], 91)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Musical"], 30)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Romantic Comedy"], 76)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Thriller/Suspense"], 158)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Western"], 12)
movies_discretized_genre_pp.head()
movies_discretized_genre_pp["genre_count"] = pd.to_numeric(movies_discretized_genre_pp["genre_count"])
#Okay, we are one step closer... Now, we need to create a column that takes the counts/genre_counts * 100
movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"] *100
movies_discretized_genre_pp.head()
'''Attempting to graph this data using a grouped bar chart:
formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"],
title = "you can set the title of your graph here")'''
graph = movies_discretized_genre_pp.pivot("genre", "dom_percent_profit",
"percent").plot(kind="bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
title = "Percent of Percent Profit to Genre Category")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Budget Category
plt.xlabel("Genre")
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.6), ncol = 4, title = "Percent Makeup of Genre Category")