#Ali and Kendra Final Project
#Importing pandas, csv, and the other libraries we need
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
#To create testing and training dfs and labels
from sklearn.model_selection import train_test_split
#To get a count or tally of a category in our df
from collections import Counter
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
#for pre-processing to fit all numeric data on the standard scale
from sklearn.preprocessing import StandardScaler
#for applying PCA function on training and testing sets
from sklearn.decomposition import PCA
#logistic regression
from sklearn.linear_model import LogisticRegression
#SVMs
from sklearn.svm import SVC
#For association rule mining
from apyori import apriori
#This will allow us to silence the warnings
import warnings
warnings.simplefilter("ignore")
#For the confusion matrix
import seaborn as sns
#Functions that we are going to use in our file:
#Creating a function that will discretize our columns based on quartiles
def quartile_discretize(df, column, categories):
    df[column] = pd.qcut(df[column], 4, labels = categories)
    return(df[column])
#Creating a function that will merge our dfs with a left join
def left_merge_2_conditions(df1, df2, column1, column2):
    df = pd.merge(df1, df2, how = "left", on = [column1, column2])
    return(df)
#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_count(df, groupby_column, count_column):
    new_df = pd.DataFrame(df.groupby(groupby_column)[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column] = new_df.index.get_level_values(0)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)
#Creating a function that groups by two columns, counts, creates new columns from the index, drops the index and changes the column names
def groupby_2_count(df, groupby_column1, groupby_column2, count_column):
    new_df = pd.DataFrame(df.groupby([groupby_column1, groupby_column2])[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column1] = new_df.index.get_level_values(0)
    new_df[groupby_column2] = new_df.index.get_level_values(1)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)
#This will calculate the exponential moving average of the columns we want
#exponential moving averages give more weight to the most recent data and less weight to older data
def exp_moving_avg(d):
    d["exp_moving_avg"] = d["score"].ewm(span = 40, adjust = False).mean()
    exp_moving_avg = list(d["exp_moving_avg"])
    #Adding a 0 as the first entry of exp_moving_avg
    exp_moving_avg = [0] + exp_moving_avg
    #Removing the last entry in the list
    exp_moving_avg.pop()
    #Creating a column named exp_moving_avg with the results
    d["exp_moving_avg"] = exp_moving_avg
    return(exp_moving_avg)
#This will calculate the cumulative moving average
def cumulative_moving_avg(d):
    d["moving_avg"] = d["score"].expanding(min_periods = 1).mean()
    moving_avg = list(d["moving_avg"])
    #Adding a 0 as the first entry of the moving average list
    cumulative_moving_avg = [0] + moving_avg
    #Removing the last entry in the list
    cumulative_moving_avg.pop()
    return(cumulative_moving_avg)
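#Quick sanity check of the two moving-average helpers above (a toy example, not part of the movie pipeline):
#both lists are shifted down one row, so each entry only reflects the scores that came before it.
toy = pd.DataFrame({"score": [6.0, 7.0, 8.0, 9.0]})
print(cumulative_moving_avg(toy.copy()))   #expected: [0, 6.0, 6.5, 7.0]
print(exp_moving_avg(toy.copy()))          #first entry is 0; each later entry is the EMA of all earlier scores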
#This will get the list of all of the entries in the column that we are interested in for calculating the averages
def getting_list_of_entries(df, column_interested_in):
    avg_people = pd.DataFrame(df.groupby([column_interested_in, "released"])["score"].mean())
    column_interested = list(df[column_interested_in].unique())
    return([avg_people, column_interested])
# Going to use matplotlib for plotting...
# To create a plot we follow this formula:
# df.plot(x-axis, y-axis, kind = type of plot, color = [colors to use], legend = False (we did not want a legend
# displayed), title = "Title"). Then we add a y label with plt.ylabel("type label here") and an x label with
# plt.xlabel("type label here"). Finally, we change the direction of the xtick names from a 90 degree angle
# to no angle with plt.xticks(rotation = rotation angle desired)
def bar_graph_count(df, x_column, y_column, title):
    g = df.plot(x_column, y_column, kind = "bar", legend = False, title = title)
    g = plt.ylabel(y_column)
    g = plt.xlabel(x_column)
    return(g)
#This will make a df for our moving averages that we are calculating
def making_df(people_df, column_interested_in, released, person, cumulative_avg, exp_avg):
    df_2 = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean": cumulative_avg, "exp_mean": exp_avg})
    return(df_2)
#This uses the functions above to calculate the exponential and cumulative moving averages for whichever
#column we specify, and returns a df with the column of interest, released, cumulative_mean, exp_mean
def calculating_moving_avg(df, column_interested_in):
    people_df = pd.DataFrame()
    people = getting_list_of_entries(df, column_interested_in)
    avg_people = people[0]
    for person in people[1]:
        d = avg_people.groupby(column_interested_in).get_group(person)
        cumulative_avg = cumulative_moving_avg(d)
        exp_avg = exp_moving_avg(d)
        d.reset_index(inplace = True)
        released = d["released"]
        df = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean_" + column_interested_in: cumulative_avg, "exp_mean_" + column_interested_in: exp_avg})
        people_df = pd.concat([people_df, df])
    return(people_df)
#Confusion Matrix Graph Function
def confusion_matrix_graph(cm, accuracy_label, type_of_df):
    g = plt.figure(figsize = (2, 2))
    g = sns.heatmap(cm, annot = True, fmt = ".3f", linewidths = .5, square = True, cmap = 'Blues_r', cbar = False)
    g = plt.ylabel('Actual')
    g = plt.xlabel('Predicted')
    all_sample_title = type_of_df + ' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    g = plt.title(all_sample_title, size = 12)
    return(g)
#reading in the movies.csv file from Kaggle
movies = pd.read_csv("movies.csv", encoding = "ISO-8859-1")
len(movies)
#Looking at the head of the dataframe
movies.head()
#Getting the shape of the df
movies.shape
#We currently have 6,820 rows and 15 columns
#Checking to see if we have any missing values... It shows that we do not.
movies.isnull().sum()
# If we had missing values we would do the following
#We are dropping those rows with the following code
#movies.dropna(inplace = True)
#We are removing any movie that has a budget of 0, because for our machine learning, we will want to predict profit.
# movies in our df that do not contain gross or budget will not be useful for this.
movies = movies[movies["budget"] != 0]
#We are removing any movie with a gross of 0
movies = movies[movies["gross"] != 0]
# movies = movies[movies["production_companies"] != "[]"]
# movies = movies[movies["genres"] != "[]"]
len(movies)
#Checking data types of the columns
movies.dtypes
#Once we are done cleaning the data we are going to change the data types of: company, director, genre, rating, released,
#star, writer, and potentially year. If we change them now, when we clean the df and remove rows, the old categories
#remain and still show as possible categories.
#Need to change the following to date
#released,year
movies["released"] = pd.to_datetime(movies["released"])
#Separating the month and day into their own columns in case we would like to analyze based on month or day
movies["month"], movies["day"] = movies["released"].dt.month, movies["released"].dt.day
#Checking the data types of the columns and making sure the new columns were added
movies.dtypes
cat = list(range(1,13))
#Changing the month data type from int to ordered category
movies["month"] = pd.Categorical(movies["month"], ordered = True, categories = cat)
#Making sure it shows as an ordered factor
movies.month.dtype
movies.dtypes
#Getting a list of the different ratings in our df
movies["rating"].unique()
#UNRATED, NOT RATED, and Not specified all mean the same thing, therefore we are changing all NOT RATED and Not specified entries to UNRATED
movies["rating"] = movies["rating"].replace(["NOT RATED", "Not specified"], "UNRATED")
#Checking to make sure that worked:
movies["rating"].unique()
#Changing rating to an ordered factor
#Creating the order that we would like for the ordered factor
cat = ["UNRATED", "G", "PG", "PG-13", "R", "NC-17"]
#Changing to ordered factor
movies["rating"] = pd.Categorical(movies["rating"], ordered = True, categories = cat)
#Checking to see if it worked
movies.rating.dtype
#We want to be able to look at the profit for each movie... Therefore we are creating a
#profit column which is gross - budget
movies["profit"] = movies["gross"] - movies["budget"]
#Creating a percent profit column to have a normalized way to compare profits.
#percent_profit = profit/budget*100
movies["percent_profit"] = movies["profit"]/movies["budget"]*100
movies.head()
#Directors
#Calculating the cumulative and exponential moving averages of the imdb score for each director, using only the
#movies released prior to the movie we are calculating the average for.
directors_avg = calculating_moving_avg(movies, "director")
#Writers:
writers_avg = calculating_moving_avg(movies, "writer")
#actors:
stars_avg = calculating_moving_avg(movies, "star")
#company:
companies_avg = calculating_moving_avg(movies, "company")
#We are going to use our left_merge_2_conditions function:
#Inputs: df1, df2, column to merge on 1 and column to merge on 2
movies = left_merge_2_conditions(movies, directors_avg, "director", "released")
movies = left_merge_2_conditions(movies, writers_avg, "writer", "released")
movies = left_merge_2_conditions(movies, stars_avg, "star", "released")
movies = left_merge_2_conditions(movies, companies_avg, "company", "released")
movies.head()
#Looking to see what happens if we remove all the movies with a 0 for any of the exp_mean columns
movies = movies[movies["exp_mean_director"] != 0]
movies = movies[movies["exp_mean_star"] != 0]
movies = movies[movies["exp_mean_writer"] != 0]
movies = movies[movies["exp_mean_company"] != 0]
len(movies) #We still have 883 movies in our df
movies.head()
#Creating an aggregated column for the avg writer, director, company, actor cumulative mean
movies["cumulative_mean_avg"] = (movies["cumulative_mean_writer"] + movies["cumulative_mean_director"] +
movies["cumulative_mean_company"] + movies["cumulative_mean_star"])/4
# Creating an aggregated column for the avg writer, director, company,
# and actor exponential mean
movies["exp_mean_avg"] = (movies["exp_mean_writer"] + movies["exp_mean_director"] +
movies["exp_mean_company"] + movies["exp_mean_star"])/4
movies.head()
#What is the breakdown of genre in our df?
#Getting the count of movies for each genre in our df and saving it as a pandas df.
#We are grouping by genre and then getting the count of the genre column in each group by
#we could have used any column to get the count of...
#We are using the groupby_count function that takes the following arguments (df, groupby_column, count_column)
movies_genre = groupby_count(movies, "genre", "genre")
movies_genre
#Sorting the df, so the bar graph will be in descending order
movies_genre.sort_values(['count'], ascending=[False], inplace = True)
#Creating a graph of the movies_genre df using our bar_graph_count function. It takes the following inputs:
# df, x-column, y_column, and title
bar_graph_count(movies_genre, "genre", "count", "Visualization of Number of Movies Per Genre")
#Creating a data frame of the movies star count
movies_star = groupby_count(movies, "star", "genre")
movies_star.head()
# #Creating a subset of the movies_star df that contains only stars that have 2+ movies in our df
# movies_star = movies_star[movies_star["count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_star_list = list(movies_star["star"])
# movies_star_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.star.isin(movies_star_list)]
movies_star.describe()
#The majority of our 356 stars appear in only one or two movies; only 25% of our stars appear in 3 or more movies.
# We have 649 stars in our newly reduced df
# How many movies have each star in our df starred in?
#Looking to see how many stars have starred in 1, 2, 3, 4+ movies
movies_star = groupby_count(movies, "star", "genre")
#Changing the column names for our movies_star
movies_star.columns = ["number_of_movies", "star"]
movies_star = groupby_count(movies_star, "number_of_movies", "star")
movies_star
#Changing the column names
movies_star.columns = ["number_of_stars", "number_of_movies"]
#Sorting the df, so the bar graph will be in descending order
movies_star.sort_values(['number_of_movies'], ascending=[True], inplace = True)
#Creating a graph of the movies_star df using our bar_graph_count function. It takes the following inputs:
# df, x-column, y_column, and title
bar_graph_count(movies_star, "number_of_movies", "number_of_stars", "Visualization of the Number of Movies each Star has in our DF")
#Creating a data frame of the movies director count
movies_director = groupby_count(movies, "director", "genre")
movies_director.columns = ["director_count", "director"]
# #Creating a subset of the movies_director df that contains only directors that have 2 movies in our df
# movies_director = movies_director[movies_director["director_count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_director_list = list(movies_director["director"])
# movies_director_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.director.isin(movies_director_list)]
# How many movies have each director in our df produced?
#Creating a new groupby to see how many directors we have that have produced 1, 2, 3, 4... movies
movies_director = groupby_count(movies_director, "director_count", "director")
#Getting a tally of the number of directors that have the same number of movies
movies_director.columns = ["number_of_movies", "director_count"]
movies_director
#Creating a graph of the movies_director df
bar_graph_count(movies_director, "director_count", "number_of_movies", "Visualization of the Number of Movies each Director has in our DF")
# How many movies have each company in our df produced?
#Creating a new groupby to see how many company we have that have produced 1, 2, 3, 4... movies
movies_company = groupby_count(movies, "company", "star")
movies_company.columns = ["number_of_movies", "company"]
movies_company = groupby_count(movies_company, "number_of_movies", "company")
movies_company.columns = ["number_of_companies", "number_of_movies"]
movies_company
#Creating a graph of the movies_company df
bar_graph_count(movies_company, "number_of_movies", "number_of_companies", "Visualization of the Number of Movies each Company has in our DF")
#How are the movies in our df distributed by year?
#Looking to see how the movies are distributed by year
movies_year = groupby_count(movies, "year", "star")
movies_year
#Creating a graph of the movies_year df
bar_graph_count(movies_year, "year", "count", "Visualization of Number of Movies Per Year")
# What is the breakdown of the number of movies per genre per year?
#Looking at the number of movies per genre per year
#Looking to see how the movies are distributed by year
#We are using the groupby_2_count function that takes the following arguments:
#(df, groupby_column1, groupby_column2, agg_column)
movies_year_genre = groupby_2_count(movies, "year", "genre", "star")
movies_year_genre.head()
#To visualize this, we are going to use a grouped bar graph. Our bar_graph_count function does not apply to this
#situation... We are going to use the pivot function
# Attempting to graph this data using a grouped bar chart:
# formula: df.pivot(index = ..., columns = ..., values = ...).plot(kind = "type of graph", color = [list of colors],
# title = "you can set the title of your graph here")
movies_year_genre.pivot(index = "year", columns = "genre", values = "count").plot(kind = "bar", title = "Visualization of Genre Breakdown by Year")
#Creating a y axis label
plt.ylabel("Number of Movies")
#Changing the x axis label
plt.xlabel("Year")
#Moving the legend below the graph, splitting it into 4 columns, and adding the title "Genre"
plt.legend(loc = "lower center", bbox_to_anchor = (.5, -.8), ncol = 4, title = "Genre")
#This was not very helpful; if time permits we will subset the data into 5-year chunks and regraph (see the sketch below)
'''Come back here if time prevails to do what is said above'''
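#If we come back to this, below is a rough sketch of the 5-year-chunk idea (assuming the movies df above;
#the bin edges are placeholders and would need to match the actual year range in the data):
#movies["year_chunk"] = pd.cut(movies["year"], bins = range(1985, 2021, 5), right = False)
#movies_5yr_genre = groupby_2_count(movies, "year_chunk", "genre", "star")
#movies_5yr_genre.pivot(index = "year_chunk", columns = "genre", values = "count").plot(kind = "bar",
#    title = "Visualization of Genre Breakdown by 5-Year Chunk")
#plt.ylabel("Number of Movies")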
#What are the most prolific months for movies to be released in our dataset?
#Looking at month
movies_month = groupby_count(movies, "month", "star")
movies_month
#Creating a new column called month with the names of the month of the year, since the df is ordered, we know that
#the months of the year can be added this way
movies_month["month"] = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
#Changing the data type of the month column to ordered categorical
cat = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
movies_month["month"] = pd.Categorical(movies_month["month"], ordered = True, categories = cat)
movies_month.month.dtype
movies_month
bar_graph_count(movies_month, "month", "count", "Visualization of Number of Movies by Month")
#Discretizing the df based on all of the numeric columns:
#Creating a new df named movies_discretized
movies_discretized = movies.copy()
movies_discretized.columns
# We are going to discretize our data based on the quartiles. The categories are:
# extremely_low, low, high, extremely_high
# We are using our quartile_discretize function that takes the following arguments:
#(df, column, category)
categories = ["extremely_low", "low", "high", "extremely_high"]
movies_discretized["budget"] = quartile_discretize(movies_discretized, "budget", categories)
#Checking to make sure it worked
movies_discretized.budget.dtype
#Gross: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["gross"] = quartile_discretize(movies_discretized, "gross", categories)
#Checking to see that it worked
movies_discretized.gross.dtype
#Score: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["score"] = quartile_discretize(movies_discretized, "score", categories)
#Checking to see that it worked
movies_discretized.score.dtype
#Votes: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["votes"] = quartile_discretize(movies_discretized, "votes", categories)
movies_discretized.votes.dtype
#writer_mean: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["cumulative_mean_writer"] = quartile_discretize(movies_discretized,"cumulative_mean_writer", categories)
movies_discretized["exp_mean_writer"] = quartile_discretize(movies_discretized,"exp_mean_writer", categories)
movies_discretized["cumulative_mean_director"] = quartile_discretize(movies_discretized, "cumulative_mean_director", categories)
movies_discretized["exp_mean_director"] = quartile_discretize(movies_discretized,"exp_mean_director", categories)
movies_discretized["cumulative_mean_star"] = quartile_discretize(movies_discretized, "cumulative_mean_star", categories)
movies_discretized["exp_mean_star"] = quartile_discretize(movies_discretized, "exp_mean_star", categories)
movies_discretized["cumulative_mean_company"] = quartile_discretize(movies_discretized, "cumulative_mean_company", categories)
movies_discretized["exp_mean_company"] = quartile_discretize(movies_discretized, "exp_mean_company", categories)
#We are creating new label categories
categories = ["extremely_short", "short", "long", "extremely_long"]
movies_discretized["runtime"] = quartile_discretize(movies_discretized, "runtime", categories)
#Checking to see if that worked
movies_discretized.runtime.dtype
movies_discretized.percent_profit.describe()
#We are creating new label categories ; Discretized Percent Profit
#We cannot use our function on this, because we are not discretizing by quartiles
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized["percent_profit"] = pd.cut(movies_discretized["percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
#Checking to see if it worked
movies_discretized.percent_profit.dtype
movies_discretized.profit.describe()
#Profit
#We cannot use our function on this, because we are not discretizing by quartiles
movies_discretized["profit"] = pd.cut(movies_discretized["profit"], [-9999999999, 0, 1000000, 50000000, 999999999], labels = categories)
movies_discretized.head()
#We are setting new categories for the day column by creating a new column for week
# week_1 is the first 7 days of the month, week_2 is days 8 - 14, week_3 is days 15 - 21, and week_4 are the
# rest of the days
categories = ["week_1", "week_2", "week_3", "week_4"]
movies_discretized["week"] = pd.cut(movies_discretized["day"], [0, 8, 15, 22, 32], labels = categories)
movies_discretized.week.dtype
#Looking at the relationship between genre and percent profit
movies_discretized_genre_pp = groupby_2_count(movies_discretized, "genre", "percent_profit", "score")
movies_discretized_genre_pp.head()
#Now we are getting the sum of each genre category... We do not have a function for sum... we could go back and rework
#our function.
movies_discretized_genre_pp.groupby("genre")["count"].sum()
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
'''We ultimately want a column that contains the total counts for each genre group. We are probably doing this in
a roundabout way, but as I am extremely new to python this is the best way I can think of doing it. We are going to create
a new column that replicates the genre column called genre_count and then we will use the replace function to
replace the genre names with their total count '''
#First, replicating the genre column in a new column named genre_count
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
#Now replacing each genre name with the total count for that genre
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Action"], 301 )
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Adventure"], 54)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Animation"], 33)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Biography"], 38)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Comedy"], 233)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Crime"], 69)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Drama"], 116)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Fantasy"], 4)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Horror"], 27)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Mystery"], 7)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Romance"], 1)
movies_discretized_genre_pp.head()
movies_discretized_genre_pp["genre_count"] = pd.to_numeric(movies_discretized_genre_pp["genre_count"])
#Okay, we are one step closer... Now, we need to create a column that takes the counts/genre_counts * 100
movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"] *100
movies_discretized_genre_pp.head()
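#For reference, the hard-coded replace steps above could be collapsed into one groupby transform (a sketch,
#not run here), which computes each genre's total directly and would not need retyping if the data changes:
#movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp.groupby("genre")["count"].transform("sum")
#movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"]*100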
# Attempting to graph this data using a grouped bar chart:
# formula: df.pivot(index = ..., columns = ..., values = ...).plot(kind = "type of graph", color = [list of colors],
# title = "you can set the title of your graph here")
graph = movies_discretized_genre_pp.pivot(index = "genre", columns = "percent_profit",
        values = "percent").plot(kind = "bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
        title = "Percent of Percent Profit to Genre Category")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Genre
plt.xlabel("Genre")
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.6), ncol = 4, title = "Percent Makeup of Genre Category")
##############################################################################################################
# We are going to implement machine learning to see if we can predict either imdb score or percent profit
##############################################################################################################
#
##############################################################################################################
# We are going to create an all numeric df, that does not contain the profit, percent profit, votes, and gross.
# This information would give us an unfair look when trying to predict either score or percent profit, as these
# metrics would not be available prior to the release of the movie.
################################################################################################################
# Potential Machine Learning algorithms we can use on our all numeric df:
# logistic regression
# Naive Bayes
# k-means
# Decision Tree
# knn
# svm
################################################################################################################
#
#
################################################################################################################
# We are also going to create an all discretized df
################################################################################################################
################################################################################################################
# Potential Machine Learning algorithms we can use on our discretized df:
# Decision Tree
# Association Rule Mining
# Random Forest
################################################################################################################
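#A rough sketch of how the discretized df could feed the apriori import above when we get to association rule
#mining (not run here; the column choice and thresholds are placeholders): apyori expects each row as a list of
#items, so we turn every row into "column=value" strings first.
#transactions = (movies_discretized[["genre", "rating", "budget", "score", "percent_profit"]].astype(str)
#                .apply(lambda row: [col + "=" + val for col, val in row.items()], axis = 1).tolist())
#rules = list(apriori(transactions, min_support = 0.05, min_confidence = 0.5, min_lift = 1.2))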
#################################################################
#Naive Bayes
#*** All Numeric Data ***
#################################################################
#Creating testing and training datasets comprised of all numeric data for score
test_train_movies = movies.copy()
test_train_movies_score = test_train_movies.copy()
test_train_movies_score.score.describe()
#We believe that people will be most interested in whether they will get a high imdb score, not whether they will have an
#average or low score... Due to this, we have decided to discretize score into 2 categories: high and not_high.
#We have decided that a high score is 7+.
test_train_movies_score = test_train_movies_score.copy()
categories = ["not_high", "high"]
# not_high = 0 - 7
# high = 7 - 10
test_train_movies_score["score"] = pd.cut(test_train_movies_score["score"], [0, 7, 10], labels = categories)
#Getting a count of the number of movies that are classified as high and not high in our df.
test_train_movies_score_count = test_train_movies_score.groupby("score")["score"].count()
test_train_movies_score_count
# We are going to create a testing and training df that contains 242 not_high entries and 242 high entries
#First we are going to subset the not_high scores and the high scores
not_high = test_train_movies_score[test_train_movies_score["score"] == "not_high"]
test_train_movies_score = test_train_movies_score[test_train_movies_score["score"] == "high"]
#Getting the length to make sure that we have 641 movies
len(not_high)
#Checking to make sure that the test_train_movies_score is equal to the 242 high movies
len(test_train_movies_score)
#Now getting a random sample of 242 entries in the not_high df and setting the seed to 123 to reproduce the results
not_high = not_high.sample(n = 242, random_state = 123)
#Getting the length to make sure that it worked
len(not_high)
#Adding the not_high movies back to the test_train_movies_score df
test_train_movies_score = pd.concat([test_train_movies_score, not_high])
len(test_train_movies_score)
#Changing the data type of month and day to numeric
columns = ["month", "day"]
test_train_movies_score[columns] = test_train_movies_score[columns].apply(pd.to_numeric)
# We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential
# imdb rating) and all non numeric data
#We are creating a test_train_movies_score_exp df that includes the exponential moving averages. We are interested to
#see if the cumulative or exponential moving averages will help our algorithm the most.
#Using the exponential moving average to try to predict score first
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_exp = test_train_movies_score[columns].copy()
#Creating a test_train_movies_score_cumulative df
#We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential imdb rating) and all non numeric data
#Getting a list of column names, so we can copy and paste it to select the columns we want.
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_score_cumulative = test_train_movies_score[columns].copy()
#Creating a df that contains both the cumulative and exponential moving averages
columns = ['budget', 'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_cumulative_exp = test_train_movies_score[columns].copy()
# #removing the label from the test_train_movies_score_exp df and saving it in a label df
test_train_movies_score_exp_label = test_train_movies_score_exp["score"]
test_train_movies_score_exp.drop("score", axis = 1, inplace = True)
#repeating the process for cumulative
test_train_movies_score_cumulative_label = test_train_movies_score_cumulative["score"]
test_train_movies_score_cumulative.drop("score", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_score_cumulative_exp_label = test_train_movies_score_cumulative_exp["score"]
test_train_movies_score_cumulative_exp.drop("score", axis = 1, inplace = True)
#Creating 4 dfs: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
#For each test_train df
from sklearn.model_selection import train_test_split
score_exp_train, score_exp_test, score_exp_train_label, score_exp_test_label = train_test_split(test_train_movies_score_exp, test_train_movies_score_exp_label, test_size = .3, random_state = 123)
score_cumulative_train, score_cumulative_test, score_cumulative_train_label, score_cumulative_test_label = train_test_split(test_train_movies_score_cumulative, test_train_movies_score_cumulative_label, test_size = .3, random_state = 123)
score_cumulative_exp_train, score_cumulative_exp_test, score_cumulative_exp_train_label, score_cumulative_exp_test_label = train_test_split(test_train_movies_score_cumulative_exp, test_train_movies_score_cumulative_exp_label, test_size = .3, random_state = 123)
#Getting a count of high scores and not_high scores in our test label
#We would prefer to have an equal or almost equal number of movies classified as high and not high in our test set
Counter(score_exp_test_label)
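#Note: if the split above came back unbalanced, train_test_split can enforce the class balance for us by
#passing stratify = the label series, e.g. (a sketch, not run here, so the results above stay unchanged):
#train_test_split(test_train_movies_score_exp, test_train_movies_score_exp_label, test_size = .3,
#                 random_state = 123, stratify = test_train_movies_score_exp_label)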
#Using the standard scale to help preprocess and normalize the data
# performing preprocessing part
sc = StandardScaler()
score_exp_train = sc.fit_transform(score_exp_train)
score_exp_test = sc.transform(score_exp_test)
score_cumulative_train = sc.fit_transform(score_cumulative_train)
score_cumulative_test = sc.transform(score_cumulative_test)
score_cumulative_exp_train = sc.fit_transform(score_cumulative_exp_train)
score_cumulative_exp_test = sc.transform(score_cumulative_exp_test)
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(score_exp_train, score_exp_train_label)
test_predicted_exp_nb = clf.predict(score_exp_test)
clf.fit(score_cumulative_train, score_cumulative_train_label)
test_predicted_cumulative_nb = clf.predict(score_cumulative_test)
clf.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
test_predicted_cumulative_exp_nb = clf.predict(score_cumulative_exp_test)
#Accuracy for exp
exp_accuracy_nb = accuracy_score(score_exp_test_label, test_predicted_exp_nb, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_nb)
confusion_matrix_graph(cm, exp_accuracy_nb, "Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_nb))
#Accuracy for cumulative
cum_accuracy_nb = accuracy_score(score_cumulative_test_label, test_predicted_cumulative_nb, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_nb)
confusion_matrix_graph(cm, cum_accuracy_nb, "Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_nb))
#Accuracy for cumulative and exp
cum_exp_accuracy_nb = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb,
normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb)
confusion_matrix_graph(cm, cum_exp_accuracy_nb, "Cumulative & Exp")
print(classification_report(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb))
#The Naive Bayes model with the cumulative means proved better than the other options. While it correctly classified
#fewer high scores than the other two models, it had much better accuracy classifying not_high scores as well as high
#scores, correctly classifying 57 out of 74 high scores and 52 out of 72 not_high scores. The other models
#suffered when classifying not_high scores.
###########################################################################################
# PCA and Logistic Regression
###########################################################################################
#Running PCA (Principal Component Analysis) to see which variables are the most beneficial for our prediction
# pca = PCA(n_components = 2)
# score_exp_train_pca = pca.fit_transform(score_exp_train)
# score_exp_test_pca = pca.transform(score_exp_test)
# explained_variance = pca.explained_variance_ratio_
# explained_variance
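#If we wanted to actually use PCA here, one way to pick n_components is to fit it on the scaled training data
#and look at the cumulative explained variance (a sketch; the 5 is a placeholder):
#pca = PCA(n_components = 5)
#pca.fit(score_exp_train)
#print(np.cumsum(pca.explained_variance_ratio_))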
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(score_exp_train, score_exp_train_label)
#Predicting the test labels
test_predicted_exp_lg = classifier.predict(score_exp_test)
#getting the accuracy
exp_accuracy_lg = accuracy_score(score_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_lg))
#For the cumulative avg
classifier.fit(score_cumulative_train, score_cumulative_train_label)
#Predicting the test labels
test_predicted_cumulative_lg = classifier.predict(score_cumulative_test)
#getting the accuracy
cum_accuracy_lg =accuracy_score(score_cumulative_test_label, test_predicted_cumulative_lg, normalize = True)
#Looking at the confusion matrix
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_lg))
#For the cumulative and exponential averages combined
classifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
#Predicting the test labels
test_predicted_cumulative_exp_lg = classifier.predict(score_cumulative_exp_test)
#getting the accuracy
cum_exp_accuracy_lg = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg, normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Cumulative & Exp")
print(classification_report(score_cumulative_exp_test_label,test_predicted_cumulative_exp_lg))
#Out of the 3 different testing and training dfs, the df with the exponential moving averages proved best for our
#logistic regression models. We had an overall accuracy of 72.6% and accurately classified high scores 74.4% of the
#time and not_high scores 64.1% of the time...
###########################################################################################
# SVMS
###########################################################################################
#Cumulative SVM - Fitting the classifier
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_train, score_cumulative_train_label)
#Making the predictions
test_predicted_cum_svm = svclassifier.predict(score_cumulative_test)
#Creating my confusion matrix
cum_accuracy_svm = accuracy_score(score_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cum_svm))
#Exp SVM - Fitting the classifier
svclassifier = SVC(kernel='linear')
svclassifier.fit(score_exp_train, score_exp_train_label)
#Making the predictions
test_predicted_exp_svm = svclassifier.predict(score_exp_test)
#Creating my confusion matrix
exp_accuracy_svm = accuracy_score(score_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_svm))
#Exp & Cum SVM - Fitting the classifier
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
#Making the predictions
test_predicted_cum_exp_svm = svclassifier.predict(score_cumulative_exp_test)
#Creating my confusion matrix
cum_exp_accuracy_svm = accuracy_score(score_cumulative_exp_test_label, test_predicted_cum_exp_svm,
normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
print(classification_report(score_cumulative_exp_test_label, test_predicted_cum_exp_svm))
################################################################################################
#
# Now looking to see if we can predict percent profit
#
################################################################################################
#We will be using the same columns, but instead of having score, we will have percent_profit.
#We are interested in whether a movie will make a profit or not. Therefore, we are only discretizing into 2 categories:
#negative and positive.
test_train_movies_pp = test_train_movies_score.copy()
categories = ["negative", "positive"]
# Negative anything less than 0
#positive anything greater than 0
test_train_movies_pp["percent_profit"] = pd.cut(test_train_movies_pp["percent_profit"], [-101, 0, 999999], labels = categories)
#Getting the count of each category in our df
test_train_movies_pp_count = test_train_movies_pp.groupby("percent_profit")["percent_profit"].count()
test_train_movies_pp_count
# We are going to create a testing and training df that contains 198 negative, 198 positive percent_profits
#First we are going to subset the positive percent_profits and the negative percent_profits
positive = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "positive"]
test_train_movies_pp = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "negative"]
#Getting the length to make sure that we have 198 negative and 286 positive in our df
print(len(positive))
print(len(test_train_movies_pp))
#Now getting a random sample of 198 entries in the positive df and setting the seed to 123
#to reproduce the results
positive = positive.sample(n = 198, random_state = 123)
#Getting the length to make sure that it worked
print(len(positive))
#Adding the positive movies back to the test_train_movies_pp df
test_train_movies_pp = pd.concat([test_train_movies_pp, positive])
#Getting the length to make sure that the 2 dfs were combined correctly; if they were, we will have 396 movies in our df
len(test_train_movies_pp)
#Changing the data type of month and day to numeric
columns = ["month", "day"]
test_train_movies_pp[columns] = test_train_movies_pp[columns].apply(pd.to_numeric)
'''We need to remove gross, votes, profit, and score (these columns give an unfair look at the potential
percent profit) and all non numeric data
'''
#Using the exponential moving average to try to predict percent_profit first
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_exp = test_train_movies_pp[columns].copy()
#We need to remove gross, votes, profit, and score (these columns give an unfair look at the potential percent profit) and all non numeric data
#Getting a list of column names, so we can copy and paste it to select the columns we want.
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_pp_cumulative = test_train_movies_pp[columns].copy()
columns = ['percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_cumulative_exp = test_train_movies_pp[columns].copy()
# #removing the label from the test_train_movies_pp df and saving it in a label df
test_train_movies_pp_exp_label = test_train_movies_pp_exp["percent_profit"]
test_train_movies_pp_exp.drop("percent_profit", axis = 1, inplace = True)
#repeating the process for cumulative
test_train_movies_pp_cumulative_label = test_train_movies_pp_cumulative["percent_profit"]
test_train_movies_pp_cumulative.drop("percent_profit", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_pp_cumulative_exp_label = test_train_movies_pp_cumulative_exp["percent_profit"]
test_train_movies_pp_cumulative_exp.drop("percent_profit", axis = 1, inplace = True)
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
from sklearn.model_selection import train_test_split
pp_exp_train, pp_exp_test, pp_exp_train_label, pp_exp_test_label = train_test_split(test_train_movies_pp_exp, test_train_movies_pp_exp_label, test_size = .3, random_state = 123)
pp_cumulative_train, pp_cumulative_test, pp_cumulative_train_label, pp_cumulative_test_label = train_test_split(test_train_movies_pp_cumulative, test_train_movies_pp_cumulative_label, test_size = .3, random_state = 123)
pp_cumulative_exp_train, pp_cumulative_exp_test, pp_cumulative_exp_train_label, pp_cumulative_exp_test_label = train_test_split(test_train_movies_pp_cumulative_exp, test_train_movies_pp_cumulative_exp_label, test_size = .3, random_state = 123)
#Getting a count of each percent_profit category in our test label
#We want to have roughly the same number of positive and negative movies in our test df
Counter(pp_exp_test_label)
#Using the standard scale to help preprocess and normalize the data
# performing preprocessing part
sc = StandardScaler()
pp_exp_train = sc.fit_transform(pp_exp_train)
pp_exp_test = sc.transform(pp_exp_test)
pp_cumulative_train = sc.fit_transform(pp_cumulative_train)
pp_cumulative_test = sc.transform(pp_cumulative_test)
pp_cumulative_exp_train = sc.fit_transform(pp_cumulative_exp_train)
pp_cumulative_exp_test = sc.transform(pp_cumulative_exp_test)
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(pp_exp_train, pp_exp_train_label)
test_predicted_exp_nb = clf.predict(pp_exp_test)
clf.fit(pp_cumulative_train, pp_cumulative_train_label)
test_predicted_cumulative_nb = clf.predict(pp_cumulative_test)
clf.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
test_predicted_cumulative_exp_nb = clf.predict(pp_cumulative_exp_test)
#Accuracy for exp
exp_accuracy_nb = accuracy_score(pp_exp_test_label, test_predicted_exp_nb, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_nb)
confusion_matrix_graph(cm, exp_accuracy_nb, "NB Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_nb))
#Accuracy for cumulative
cum_accuracy_nb = accuracy_score(pp_cumulative_test_label, test_predicted_cumulative_nb, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cumulative_nb)
confusion_matrix_graph(cm, cum_accuracy_nb, "NB Cumulative")
#Accuracy for cumulative and exp
exp_cum_accuracy_nb = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_nb, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_nb)
confusion_matrix_graph(cm, exp_cum_accuracy_nb, "NB Exp & Cumulative")
###########################################################################################
# PCA and Logistic Regression
###########################################################################################
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_exp_train, pp_exp_train_label)
#Predicting the test labels
test_predicted_exp_lg = classifier.predict(pp_exp_test)
#getting the accuracy
exp_accuracy_lg = accuracy_score(pp_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_lg))
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_cumulative_train, pp_cumulative_train_label)
#Predicting the test labels
test_predicted_cum_lg = classifier.predict(pp_cumulative_test)
#getting the accuracy
cum_accuracy_lg = accuracy_score(pp_cumulative_test_label, test_predicted_cum_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
print(classification_report(pp_cumulative_test_label, test_predicted_cum_lg))
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
#Predicting the test labels
test_predicted_cum_exp_lg = classifier.predict(pp_cumulative_exp_test)
#getting the accuracy
cum_exp_accuracy_lg = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Exp & Cumulative")
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg))
###########################################################################################
# SVMS
###########################################################################################
#Cumulative SVM - Fitting the classifier
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_train, pp_cumulative_train_label)
#Making the predictions
test_predicted_cum_svm = svclassifier.predict(pp_cumulative_test)
#Creating my confusion matrix
cum_accuracy_svm = accuracy_score(pp_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
print(classification_report(pp_cumulative_test_label, test_predicted_cum_svm))
#Exp SVM - Fitting the classifier
svclassifier = SVC(kernel='linear')
svclassifier.fit(pp_exp_train, pp_exp_train_label)
#Making the predictions
test_predicted_exp_svm = svclassifier.predict(pp_exp_test)
#Creating my confusion matrix
exp_accuracy_svm = accuracy_score(pp_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_svm))
#Exp & Cum SVM - Fitting the classifier
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
#Making the predictions
test_predicted_cum_exp_svm = svclassifier.predict(pp_cumulative_exp_test)
#Creating my confusion matrix
cum_exp_accuracy_svm = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm,
normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm))
# We had much more success when predicting score than when predicting percent profit... However, the SVM with a
#linear kernel on the exponential df did have a 57% accuracy.
#We believe that in order to achieve better accuracy when predicting percent profit, we need more data. Our
#next step is to find more data and then apply what we learned from this analysis to the new data.