IMDB | IST 652 FINAL PROJECT | ALI HO & KENDRA OSBURN

=======================================================

PART 1: COMPARISON to R

=======================================================

In [2]:
#Ali and Kendra Final Project 

#importing pandas, csv, 
import csv 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
#To create testing and training dfs and labels 
from sklearn.model_selection import train_test_split 
#To get a count or tally of a category in our df 
from collections import Counter
# To model the Gaussian Navie Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix 
from sklearn.metrics import confusion_matrix, classification_report
#for pre-processing to fit all numeric data on the standard scale 
from sklearn.preprocessing import StandardScaler
#for applying PCA function on training and testing sets 
from sklearn.decomposition import PCA
#logistic regression 
from sklearn.linear_model import LogisticRegression
#SVMs 
from sklearn.svm import SVC
#For association rule mining 
from apyori import apriori
#This will allow us to silence the warnings
import warnings 
warnings.simplefilter("ignore")
#For the confusion matrix 
import seaborn as sns
In [3]:
#Functions that we are going to use in our file: 

#Creating a function that will discretize our columns based on quartiles 
def quartile_discretize(df, column, categories): 
    df[column] = pd.qcut(df[column], 4, labels = categories)
    return(df[column])

#Creating a function that will merge our dfs with a left join
def left_merge_2_conditions(df1, df2, column1, column2):
    df = pd.merge(df1, df2, how = "left", on=[column1, column2])
    return(df)

#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_count(df, groupby_column, count_column): 
    new_df = pd.DataFrame(df.groupby(groupby_column)[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column] = new_df.index.get_level_values(0)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)

#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_2_count(df, groupby_column1, groupby_column2, count_column): 
    new_df = pd.DataFrame(df.groupby([groupby_column1, groupby_column2 ])[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column1] = new_df.index.get_level_values(0)
    new_df[groupby_column2] = new_df.index.get_level_values(1)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)

#This will calculate the exponential moving average of the columns we want
#exponential moving averages give more weight to the most recent data and less weight to older data 
def exp_moving_avg(d):
    d["exp_moving_avg"] = d["score"].ewm(span=40,adjust=False).mean()
    exp_moving_avg = list(d["exp_moving_avg"])
    #Adding a 0 to the first entry to exp_moving_avg
    exp_moving_avg = [0] + exp_moving_avg
    #Removing the last entry in the list 
    exp_moving_avg.pop()
    #Creating a column named exp_moving_avg with the results 
    d["exp_moving_avg"] = exp_moving_avg
    return(exp_moving_avg)

#This will calculate the cumulative moving average
def cumulative_moving_avg(d): 
    d["moving_avg"] = d.expanding(min_periods = 1).mean()
    moving_avg = list(d["moving_avg"])
    #Adding a 0 to the first entry to moving avg
    cumulative_moving_avg = [0] + moving_avg
    #Removing the last entry in the list 
    cumulative_moving_avg.pop()
    return(cumulative_moving_avg)

#This will get the list of all of the entries in the column that we are interested in for calculating the averages
def getting_list_of_entries(df, column_interested_in): 
    avg_people = pd.DataFrame(df.groupby([column_interested_in, "released"])["score"].mean())
    avg_column_scores = pd.DataFrame()
    column_interested = list(df[column_interested_in].unique())
    return([avg_people, column_interested])

# Going to use matplotlib for plotting... 
# To create a plot we followed the following formula: 
# df.plot(x-axis, y-axis, kind = type of plot, color = [(we specified colors to use here)], legend = False (we did not 
# want a legend displayed), title = "Title") then we added a ylabel with plt.ylabel("Type label here") and an x label 
# with plt.xlabel("type label here"). Finally, we wanted to change the direction of the xtick names from a 90 degree angle 
# to no angle with plt.xticks(rotation = rotation angle desired)
def bar_graph_count(df, x_column, y_column, title):
    g = df.plot(x_column, y_column, kind = "bar", legend = False, title = title)
    g = plt.ylabel(y_column)
    g = plt.xlabel(x_column)
    return(g)

#This will make a df for our moving averages that we are calculating 
def making_df(people_df, column_interested_in, released, person, cumulative_avg, exp_avg): 
    df_2 = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean": cumulative_avg, "exp_mean": exp_avg})
    return(df_2)

#This includes the functions above, and will calculate the exponential and cumulative moving averages for which ever 
#column we specify and return a df will the column interested in, released, cumulative_mean, exp_mean
def calculating_moving_avg(df, column_interested_in):
    people_df = pd.DataFrame()
    people = getting_list_of_entries(df, column_interested_in)
    cumulative_avg = []
    avg_people = people[0]
    avg_people
    for person in people[1]: 
        d = avg_people.groupby(column_interested_in).get_group(person)
        cumulative_avg = cumulative_moving_avg(d)
        exp_avg = exp_moving_avg(d)
        d.reset_index(inplace = True)
        released = d["released"]
        df = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean_"+column_interested_in : cumulative_avg, "exp_mean_"+column_interested_in: exp_avg})
        people_df = people_df.append(df)
    return(people_df)

#Confusion Matrix Graph Function 
def confusion_matrix_graph (cm, accuracy_label, type_of_df): 
    g = plt.figure(figsize=(2,2))
    g = sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r', cbar = False);
    g = plt.ylabel('Actual');
    g = plt.xlabel('Predicted');
    g = all_sample_title = type_of_df +' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    g = plt.title(all_sample_title, size = 12);
    return(g)
In [4]:
#reading in the movies.csv file from Kaggle 
movies = pd.read_csv("movies.csv", encoding = "ISO-8859-1")
In [5]:
len(movies)
Out[5]:
6820
In [6]:
#Looking at the head of the dataframe 
movies.head()
Out[6]:
budget company country director genre gross name rating released runtime score star votes writer year
0 8000000 Columbia Pictures Corporation USA Rob Reiner Adventure 52287414 Stand by Me R 8/22/1986 89 8.1 Wil Wheaton 299174 Stephen King 1986
1 6000000 Paramount Pictures USA John Hughes Comedy 70136369 Ferris Bueller's Day Off PG-13 6/11/1986 103 7.8 Matthew Broderick 264740 John Hughes 1986
2 15000000 Paramount Pictures USA Tony Scott Action 179800601 Top Gun PG 5/16/1986 110 6.9 Tom Cruise 236909 Jim Cash 1986
3 18500000 Twentieth Century Fox Film Corporation USA James Cameron Action 85160248 Aliens R 7/18/1986 137 8.4 Sigourney Weaver 540152 James Cameron 1986
4 9000000 Walt Disney Pictures USA Randal Kleiser Adventure 18564613 Flight of the Navigator PG 8/1/1986 90 6.9 Joey Cramer 36636 Mark H. Baker 1986
In [7]:
#Getting the shape of the df 
movies.shape
#We currently have 6,820 rows and 15 columns 
Out[7]:
(6820, 15)
In [8]:
#Checking to see if we have any missing values... It shows that we do not. 
movies.isnull().sum()
Out[8]:
budget      0
company     0
country     0
director    0
genre       0
gross       0
name        0
rating      0
released    0
runtime     0
score       0
star        0
votes       0
writer      0
year        0
dtype: int64
In [9]:
# If we had missing values we would do the following
#We are dropping those rows with the following code
#movies.dropna(inplace = True)
In [10]:
#We are removing any movie that has a budget of 0, because for our machine learning, we will want to predict profit. 
# movies in our df that do not contain gross or budget will not be useful for this. 
movies = movies[movies["budget"] != 0]
#We are removing any movie with a gross of 0 
movies = movies[movies["gross"] != 0]
# movies = movies[movies["production_companies"] != "[]"]
# movies = movies[movies["genres"] !=  "[]"]
len(movies)
Out[10]:
4638
In [11]:
#Checking data types of the columns 
movies.dtypes
#Once we are done cleaning the data we are going to change the data types of: company, director, genre, rating, released
#star, writer, and potentially year. If we change them now, when we clean the df and removed rows, the old categories 
#remain, and still show as possible categories. 
Out[11]:
budget        int64
company      object
country      object
director     object
genre        object
gross         int64
name         object
rating       object
released     object
runtime       int64
score       float64
star         object
votes         int64
writer       object
year          int64
dtype: object
In [12]:
#Need to change the following to date 
    #released,year
movies["released"] = pd.to_datetime(movies["released"])
In [13]:
#Separating the month, day and year into their own columns in case we would like to analyze based on month, day or year
movies["month"], movies["day"] = movies["released"].dt.month, movies["released"].dt.day
In [14]:
#Checking the data types of the columns and making sure the new columns were added 
movies.dtypes
Out[14]:
budget               int64
company             object
country             object
director            object
genre               object
gross                int64
name                object
rating              object
released    datetime64[ns]
runtime              int64
score              float64
star                object
votes                int64
writer              object
year                 int64
month                int64
day                  int64
dtype: object
In [15]:
cat = list(range(1,13))
#Changing the month data type from int to ordered category 
movies["month"] = pd.Categorical(movies["month"], ordered = True, categories = cat)
In [16]:
#Making sure it shows as an ordered factor 
movies.month.dtype
Out[16]:
CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], ordered=True)
In [17]:
movies.dtypes
Out[17]:
budget               int64
company             object
country             object
director            object
genre               object
gross                int64
name                object
rating              object
released    datetime64[ns]
runtime              int64
score              float64
star                object
votes                int64
writer              object
year                 int64
month             category
day                  int64
dtype: object
In [18]:
#Getting a list of a the different ratings in our df 
movies["rating"].unique()
Out[18]:
array(['R', 'PG-13', 'PG', 'UNRATED', 'G', 'NC-17', 'NOT RATED',
       'Not specified'], dtype=object)
In [19]:
#UNRATED, not specified AND NOT RATED mean the same thing, therefore we are going to change all not rated, not specified entries to unrated 
movies["rating"] = movies["rating"].replace(["NOT RATED", "Not specified"], "UNRATED")
In [20]:
#Checking to make sure that worked: 
movies["rating"].unique()
Out[20]:
array(['R', 'PG-13', 'PG', 'UNRATED', 'G', 'NC-17'], dtype=object)
In [21]:
#Changing rating to an ordered factor
#Creating the order that we would like for the ordered factor
cat = ["UNRATED", "G", "PG", "PG-13", "R", "NC-17"]
#Changing to ordered factor 
movies["rating"] = pd.Categorical(movies["rating"], ordered = True, categories = cat)
In [22]:
#Checking to see if it worked 
movies.rating.dtype
Out[22]:
CategoricalDtype(categories=['UNRATED', 'G', 'PG', 'PG-13', 'R', 'NC-17'], ordered=True)
In [23]:
#We want to be able to look at the profit for each movie... Therefore we are creating a 
#profit column which is gross - budget 

movies["profit"] = movies["gross"] - movies["budget"]
In [24]:
#Creating a percent profit column to have a normalized way to compare profits. 
#percent_profit = profit/budget*100 
movies["percent_profit"] = movies["profit"]/movies["budget"]*100
In [25]:
movies.head()
Out[25]:
budget company country director genre gross name rating released runtime score star votes writer year month day profit percent_profit
0 8000000 Columbia Pictures Corporation USA Rob Reiner Adventure 52287414 Stand by Me R 1986-08-22 89 8.1 Wil Wheaton 299174 Stephen King 1986 8 22 44287414 553.592675
1 6000000 Paramount Pictures USA John Hughes Comedy 70136369 Ferris Bueller's Day Off PG-13 1986-06-11 103 7.8 Matthew Broderick 264740 John Hughes 1986 6 11 64136369 1068.939483
2 15000000 Paramount Pictures USA Tony Scott Action 179800601 Top Gun PG 1986-05-16 110 6.9 Tom Cruise 236909 Jim Cash 1986 5 16 164800601 1098.670673
3 18500000 Twentieth Century Fox Film Corporation USA James Cameron Action 85160248 Aliens R 1986-07-18 137 8.4 Sigourney Weaver 540152 James Cameron 1986 7 18 66660248 360.325665
4 9000000 Walt Disney Pictures USA Randal Kleiser Adventure 18564613 Flight of the Navigator PG 1986-08-01 90 6.9 Joey Cramer 36636 Mark H. Baker 1986 8 1 9564613 106.273478
In [26]:
#Directors
#Aggregating a moving average column and calculating the mean average imdb score for each actor; by calculating the 
#mean imdb scores for all actors but for only the movies prior to the movie we are calculting the mean for. 
directors_avg = calculating_moving_avg(movies, "director")
#Writers: 
writers_avg = calculating_moving_avg(movies, "writer")
#actors: 
stars_avg = calculating_moving_avg(movies, "star")
#company: 
companies_avg = calculating_moving_avg(movies, "company")
In [27]:
#We are going to use our left_merge_2_conditions function: 
#Inputs: df1, df2, column to merge on 1 and column to merge on 2 
movies = left_merge_2_conditions(movies, directors_avg, "director", "released")
movies = left_merge_2_conditions(movies, writers_avg, "writer", "released")
movies = left_merge_2_conditions(movies, stars_avg, "star", "released")
movies = left_merge_2_conditions(movies, companies_avg, "company", "released")
In [28]:
movies.head()
Out[28]:
budget company country director genre gross name rating released runtime ... profit percent_profit cumulative_mean_director exp_mean_director cumulative_mean_writer exp_mean_writer cumulative_mean_star exp_mean_star cumulative_mean_company exp_mean_company
0 8000000 Columbia Pictures Corporation USA Rob Reiner Adventure 52287414 Stand by Me R 1986-08-22 89 ... 44287414 553.592675 0.0 0.0 5.4 5.4 0.0 0.0 5.95 6.265854
1 6000000 Paramount Pictures USA John Hughes Comedy 70136369 Ferris Bueller's Day Off PG-13 1986-06-11 103 ... 64136369 1068.939483 0.0 0.0 6.8 6.8 0.0 0.0 6.10 6.641843
2 15000000 Paramount Pictures USA Tony Scott Action 179800601 Top Gun PG 1986-05-16 110 ... 164800601 1098.670673 0.0 0.0 0.0 0.0 0.0 0.0 5.90 6.628604
3 18500000 Twentieth Century Fox Film Corporation USA James Cameron Action 85160248 Aliens R 1986-07-18 137 ... 66660248 360.325665 0.0 0.0 0.0 0.0 0.0 0.0 7.05 6.824390
4 9000000 Walt Disney Pictures USA Randal Kleiser Adventure 18564613 Flight of the Navigator PG 1986-08-01 90 ... 9564613 106.273478 0.0 0.0 0.0 0.0 0.0 0.0 7.20 7.200000

5 rows × 27 columns

In [29]:
#Looking to see what happens if we remove all the movies with a 0 for exp_mean_director and exp_mean_star
movies = movies[movies["exp_mean_director"] != 0]
movies = movies[movies["exp_mean_star"] != 0]
movies = movies[movies["exp_mean_writer"] != 0]
movies = movies[movies["exp_mean_company"] != 0]
In [30]:
len(movies) #We still have 883 movies in our df
Out[30]:
883
In [31]:
movies.head()
Out[31]:
budget company country director genre gross name rating released runtime ... profit percent_profit cumulative_mean_director exp_mean_director cumulative_mean_writer exp_mean_writer cumulative_mean_star exp_mean_star cumulative_mean_company exp_mean_company
110 30000000 Paramount Pictures USA John Hughes Comedy 49530280 Planes, Trains & Automobiles R 1987-11-25 93 ... 19530280 65.100933 7.8 7.8 7.3 6.84878 6.5 6.409756 6.428571 6.633831
116 28000000 Paramount Pictures USA Tony Scott Action 153665000 Beverly Hills Cop II R 1987-05-20 100 ... 125665000 448.803571 6.9 6.9 6.2 6.20000 5.9 5.900000 6.245455 6.557347
139 16000000 Orion Pictures USA Woody Allen Comedy 14792779 Radio Days PG 1987-01-30 88 ... -1207221 -7.545131 8.0 8.0 8.0 8.00000 8.0 8.000000 7.100000 6.755443
151 350000 Cannon Group USA Sam Firstenberg Action 4000000 American Ninja 2: The Confrontation R 1987-05-01 90 ... 3650000 1042.857143 5.9 5.9 5.7 5.70000 5.9 5.900000 5.350000 5.034146
213 6000000 Paramount Pictures Canada Jonathan Kaplan Drama 32069318 Acusados R 1988-10-14 111 ... 26069318 434.488633 6.1 6.1 6.7 6.70000 6.2 6.200000 6.377778 6.546875

5 rows × 27 columns

In [32]:
#Creating an aggregated column for the avg writer, director, company, actor cumulative mean 
movies["cumulative_mean_avg"] = (movies["cumulative_mean_writer"] + movies["cumulative_mean_director"] + 
                              movies["cumulative_mean_company"] + movies["cumulative_mean_star"])/4
In [33]:
# Creating an aggregated column for the avg writer, director, company, 
# and actor exponential mean
movies["exp_mean_avg"] = (movies["exp_mean_writer"] + movies["exp_mean_director"] + 
                              movies["exp_mean_company"] + movies["exp_mean_star"])/4
In [34]:
movies.head()
Out[34]:
budget company country director genre gross name rating released runtime ... cumulative_mean_director exp_mean_director cumulative_mean_writer exp_mean_writer cumulative_mean_star exp_mean_star cumulative_mean_company exp_mean_company cumulative_mean_avg exp_mean_avg
110 30000000 Paramount Pictures USA John Hughes Comedy 49530280 Planes, Trains & Automobiles R 1987-11-25 93 ... 7.8 7.8 7.3 6.84878 6.5 6.409756 6.428571 6.633831 7.007143 6.923092
116 28000000 Paramount Pictures USA Tony Scott Action 153665000 Beverly Hills Cop II R 1987-05-20 100 ... 6.9 6.9 6.2 6.20000 5.9 5.900000 6.245455 6.557347 6.311364 6.389337
139 16000000 Orion Pictures USA Woody Allen Comedy 14792779 Radio Days PG 1987-01-30 88 ... 8.0 8.0 8.0 8.00000 8.0 8.000000 7.100000 6.755443 7.775000 7.688861
151 350000 Cannon Group USA Sam Firstenberg Action 4000000 American Ninja 2: The Confrontation R 1987-05-01 90 ... 5.9 5.9 5.7 5.70000 5.9 5.900000 5.350000 5.034146 5.712500 5.633537
213 6000000 Paramount Pictures Canada Jonathan Kaplan Drama 32069318 Acusados R 1988-10-14 111 ... 6.1 6.1 6.7 6.70000 6.2 6.200000 6.377778 6.546875 6.344444 6.386719

5 rows × 29 columns

In [35]:
#What is the breakdown of genre in our df?

#Getting the count of movies for each genre in our df and saving it as a pandas df. 
#We are grouping by genre and then getting the count of the genre column in each group by 
#we could have used any column to get the count of... 
#We are using the groupby_count function that takes the following arguments (df, groupby_column, count_column)
movies_genre = groupby_count(movies, "genre", "genre")
In [36]:
movies_genre
Out[36]:
count genre
0 301 Action
1 54 Adventure
2 33 Animation
3 38 Biography
4 233 Comedy
5 69 Crime
6 116 Drama
7 4 Fantasy
8 27 Horror
9 7 Mystery
10 1 Romance
In [37]:
#Sorting the df, so the bar graph will be in descending order
movies_genre.sort_values(['count'], ascending=[False], inplace = True)
#Creating a graph of the movies_genre df using our bar_graph_count function. It takes the following inputs: 
# df, x-column, y_column, and title 
bar_graph_count(movies_genre, "genre", "count", "Visualization of Number of Movies Per Genre")
Out[37]:
Text(0.5, 0, 'genre')
In [38]:
#Creating a data frame of the movies star count 
movies_star = groupby_count(movies, "star", "genre") 
In [39]:
movies_star.head()
Out[39]:
count star
0 2 Aaron Eckhart
1 2 Aaron Taylor-Johnson
2 12 Adam Sandler
3 3 Al Pacino
4 1 Albert Brooks
In [40]:
# #Creating a subset of the movies_star df that contains only stars that have 2+ movies in our df 
# movies_star = movies_star[movies_star["count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_star_list = list(movies_star["star"])
# movies_star_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.star.isin(movies_star_list)]
In [41]:
movies_star.describe()
#The majority of our 356 stars.  Only 25% of our stars in 3 or more movies. 
# We have 649 stars in our newly reduced df 
Out[41]:
count
count 356.000000
mean 2.480337
std 2.290742
min 1.000000
25% 1.000000
50% 2.000000
75% 3.000000
max 12.000000
In [42]:
# How many movies have each star in our df starred in? 

#Looking to see how many stars have starred in 1, 2, 3, 4+ movies 
movies_star = groupby_count(movies, "star", "genre")
In [43]:
#Changing the column names for our movies_star
movies_star.columns = ["number_of_movies", "star"]
movies_star = groupby_count(movies_star, "number_of_movies", "star")
In [44]:
movies_star
Out[44]:
count number_of_movies
0 170 1
1 78 2
2 40 3
3 19 4
4 9 5
5 17 6
6 5 7
7 5 8
8 3 9
9 3 10
10 2 11
11 5 12
In [45]:
#Changing the column names 
movies_star.columns = ["number_of_stars", "number_of_movies"]
In [46]:
#Sorting the df, so the bar graph will be in descending order
movies_star.sort_values(['number_of_movies'], ascending=[True], inplace = True)
#Creating a graph of the movies_star df using our bar_graph_count function. It takes the following inputs: 
# df, x-column, y_column, and title 
bar_graph_count(movies_star, "number_of_movies", "number_of_stars", "Visualization of the The Number of Movies each Star has in our DF")
Out[46]:
Text(0.5, 0, 'number_of_movies')
In [47]:
#Creating a data frame of the movies director count 
movies_director = groupby_count(movies, "director", "genre")
movies_director.columns = ["director_count", "director"]
In [48]:
# #Creating a subset of the movies_director df that contains only directors that have 2 movies in our df 
# movies_director = movies_director[movies_director["director_count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_director_list = list(movies_director["director"])
# movies_director_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.director.isin(movies_director_list)]
In [49]:
# How many movies have each director in our df produced? 

#Creating a new groupby to see how many directors we have that have produced 1, 2, 3, 4... movies 
movies_director = groupby_count(movies_director, "director_count", "director")
In [50]:
#Getting a tally of the number of directors that have the same number of movies 
movies_director.columns = ["number_of_movies", "director_count"]
movies_director
Out[50]:
number_of_movies director_count
0 257 1
1 94 2
2 48 3
3 22 4
4 15 5
5 11 6
6 4 7
7 2 9
8 1 19
In [51]:
#Creating a graph of the movies_director df 
bar_graph_count(movies_director, "director_count", "number_of_movies", "Visualization of the The Number of Movies each Director has in our DF")
Out[51]:
Text(0.5, 0, 'director_count')
In [52]:
# How many movies have each company in our df produced? 
#Creating a new groupby to see how many company we have that have produced 1, 2, 3, 4... movies 
movies_company = groupby_count(movies, "company", "star")
movies_company.columns = ["number_of_movies", "company"]
movies_company = groupby_count(movies_company, "number_of_movies", "company")
movies_company.columns = ["number_of_companies", "number_of_movies"]
movies_company
Out[52]:
number_of_companies number_of_movies
0 99 1
1 25 2
2 11 3
3 6 4
4 2 5
5 1 6
6 2 7
7 1 8
8 2 9
9 2 11
10 2 12
11 2 13
12 1 14
13 1 15
14 2 17
15 1 18
16 2 21
17 1 28
18 1 34
19 2 50
20 1 72
21 1 90
22 1 102
In [53]:
#Creating a graph of the movies_company df 
bar_graph_count(movies_company, "number_of_movies", "number_of_companies", "Visualization of the The Number of Movies each Company has in our DF")
Out[53]:
Text(0.5, 0, 'number_of_movies')
In [54]:
#How are the movies in our df distributed by year?

#Looking to see how the movies are distributed by year 
movies_year = groupby_count(movies, "year", "star")
movies_year
Out[54]:
count year
0 4 1987
1 3 1988
2 5 1989
3 5 1990
4 10 1991
5 16 1992
6 18 1993
7 21 1994
8 17 1995
9 18 1996
10 18 1997
11 20 1998
12 26 1999
13 25 2000
14 30 2001
15 35 2002
16 32 2003
17 45 2004
18 38 2005
19 48 2006
20 42 2007
21 33 2008
22 47 2009
23 49 2010
24 51 2011
25 43 2012
26 50 2013
27 38 2014
28 42 2015
29 54 2016
In [55]:
#Creating a graph of the movies_year df 
bar_graph_count(movies_year, "year", "count", "Visualization of Number of Movies Per Year")
Out[55]:
Text(0.5, 0, 'year')
In [56]:
# What is the breakdown of the number of movies per genre per year?

#Looking at the number of movies per genre per year 
#Looking to see how the movies are distributed by year 
#We are using the groupby_2 function that takes the following arguements: 
#(df, groupby_column1, groupby_column2, agg_column)
movies_year_genre = groupby_2_count(movies, "year", "genre", "star")
In [57]:
movies_year_genre.head()
Out[57]:
count year genre
0 2 1987 Action
1 2 1987 Comedy
2 2 1988 Comedy
3 1 1988 Drama
4 2 1989 Action
In [58]:
#To visualize this, we are going to use a grouped bar graph. Our bar_graph_count function does not apply to this 
#situation... We are going to use the pivot function 
# Attempting to graph this data using a grouped bar chart: 
# formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"], 
# title = "you can set the title of your graph here")
movies_year_genre.pivot("year", "genre", "count").plot(kind="bar", title = "Visualization of Genre Breakdown by Year")
#Creating a y axis label 
plt.ylabel("Number of Movies")
#Changing the x axis label 
plt.xlabel("Year")
#Changing the label to display male and female instead of 0 and 1
plt.legend(loc = "lower center", bbox_to_anchor = (.5, -.8), ncol = 4, title = "Genre")

#This was not very helpful, we are going to subset the data by 5 year chunks and regraph
Out[58]:
<matplotlib.legend.Legend at 0x27e096ac8d0>
In [59]:
'''Come back here if time prevails to do what is said above'''
Out[59]:
'Come back here if time prevails to do what is said above'
In [60]:
#What are the most prolific months for movies to be released in our dataset?

#Looking at month 
movies_month = groupby_count(movies, "month", "star")
movies_month
Out[60]:
count month
0 63 1
1 47 2
2 55 3
3 58 4
4 81 5
5 79 6
6 78 7
7 81 8
8 57 9
9 86 10
10 96 11
11 102 12
In [61]:
#Creating a new column called month with the names of the month of the year, since the df is ordered, we know that 
#the months of the year can be added this way 
movies_month["month"] = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
In [62]:
#Changing the data type of the month column to ordered categorical 
cat = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
movies_month["month"] = pd.Categorical(movies_month["month"], ordered = True, categories = cat)
In [63]:
movies_month.month.dtype
Out[63]:
CategoricalDtype(categories=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug',
                  'Sept', 'Oct', 'Nov', 'Dec'],
                 ordered=True)
In [64]:
movies_month
Out[64]:
count month
0 63 Jan
1 47 Feb
2 55 Mar
3 58 Apr
4 81 May
5 79 Jun
6 78 Jul
7 81 Aug
8 57 Sept
9 86 Oct
10 96 Nov
11 102 Dec
In [65]:
bar_graph_count(movies_month, "month", "count", "Visualization of Number of Movies by Month")
Out[65]:
Text(0.5, 0, 'month')
In [66]:
#Descritizing the df based on all of the numeric columns: 
#Creating a new df name movies_descritized 
movies_discretized = movies.copy()
In [67]:
movies_discretized.columns
Out[67]:
Index(['budget', 'company', 'country', 'director', 'genre', 'gross', 'name',
       'rating', 'released', 'runtime', 'score', 'star', 'votes', 'writer',
       'year', 'month', 'day', 'profit', 'percent_profit',
       'cumulative_mean_director', 'exp_mean_director',
       'cumulative_mean_writer', 'exp_mean_writer', 'cumulative_mean_star',
       'exp_mean_star', 'cumulative_mean_company', 'exp_mean_company',
       'cumulative_mean_avg', 'exp_mean_avg'],
      dtype='object')
In [68]:
# We are going to descritize our data based on the quartiles. The categories are: 
# extremely_low, low, high, extremely_high
# We are using our quartile_discretize function that takes the following arguments: 
#(df, column, category)
categories = ["extremely_low", "low", "high", "extremely_high"]
movies_discretized["budget"] = quartile_discretize(movies_discretized, "budget", categories)
In [69]:
#Checking to make sure it worked 
movies_discretized.budget.dtype
Out[69]:
CategoricalDtype(categories=['extremely_low', 'low', 'high', 'extremely_high'], ordered=True)
In [70]:
#Gross: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["gross"] = quartile_discretize(movies_discretized, "gross", categories)
In [71]:
#Checking to see that it worked 
movies_discretized.gross.dtype
Out[71]:
CategoricalDtype(categories=['extremely_low', 'low', 'high', 'extremely_high'], ordered=True)
In [72]:
#Score: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["score"] = quartile_discretize(movies_discretized, "score", categories)
In [73]:
#Checking to see that it worked 
movies_discretized.score.dtype
Out[73]:
CategoricalDtype(categories=['extremely_low', 'low', 'high', 'extremely_high'], ordered=True)
In [74]:
#Votes: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["votes"] = quartile_discretize(movies_discretized, "votes", categories)
In [75]:
movies_discretized.votes.dtype
Out[75]:
CategoricalDtype(categories=['extremely_low', 'low', 'high', 'extremely_high'], ordered=True)
In [76]:
#writer_mean: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["cumulative_mean_writer"] = quartile_discretize(movies_discretized,"cumulative_mean_writer", categories)
movies_discretized["exp_mean_writer"] = quartile_discretize(movies_discretized,"exp_mean_writer", categories)
movies_discretized["cumulative_mean_director"] = quartile_discretize(movies_discretized, "cumulative_mean_director", categories)
movies_discretized["exp_mean_director"] = quartile_discretize(movies_discretized,"exp_mean_director", categories)
movies_discretized["cumulative_mean_star"] = quartile_discretize(movies_discretized, "cumulative_mean_star", categories)
movies_discretized["exp_mean_star"] = quartile_discretize(movies_discretized, "exp_mean_star", categories)
movies_discretized["cumulative_mean_company"] = quartile_discretize(movies_discretized, "cumulative_mean_company", categories)
movies_discretized["exp_mean_company"] = quartile_discretize(movies_discretized, "exp_mean_company", categories)
In [77]:
#We are creating new label categories 
categories = ["extremely_short", "short", "long", "extremely_long"]
movies_discretized["runtime"] = quartile_discretize(movies_discretized, "runtime", categories)
In [78]:
#Checking to see if that worked 
movies_discretized.runtime.dtype
Out[78]:
CategoricalDtype(categories=['extremely_short', 'short', 'long', 'extremely_long'], ordered=True)
In [79]:
movies_discretized.percent_profit.describe()
Out[79]:
count     883.000000
mean       47.168989
std       171.138287
min       -99.976634
25%       -44.219378
50%         6.505111
75%        80.332174
max      1707.844444
Name: percent_profit, dtype: float64
In [80]:
#We are creating new label categories ; Discretized Percent Profit 
#We cannot use our function on this, because we are not discretizing by quartiles
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized["percent_profit"] = pd.cut(movies_discretized["percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
In [81]:
#Checking to see if it worked 
movies_discretized.percent_profit.dtype
Out[81]:
CategoricalDtype(categories=['negative', 'low', 'high', 'extremely_high'], ordered=True)
In [82]:
movies_discretized.profit.describe()
Out[82]:
count    8.830000e+02
mean     1.691477e+07
std      6.503097e+07
min     -1.286123e+08
25%     -1.660067e+07
50%      2.697999e+06
75%      3.387665e+07
max      5.022706e+08
Name: profit, dtype: float64
In [83]:
#Profit
#We cannot use our function on this, because we are not discretizing by quartiles
movies_discretized["profit"] = pd.cut(movies_discretized["profit"], [-9999999999, 0, 1000000, 50000000, 999999999], labels = categories)
In [84]:
movies_discretized.head()
Out[84]:
budget company country director genre gross name rating released runtime ... cumulative_mean_director exp_mean_director cumulative_mean_writer exp_mean_writer cumulative_mean_star exp_mean_star cumulative_mean_company exp_mean_company cumulative_mean_avg exp_mean_avg
110 low Paramount Pictures USA John Hughes Comedy low Planes, Trains & Automobiles R 1987-11-25 extremely_short ... extremely_high extremely_high extremely_high high low low high extremely_high 7.007143 6.923092
116 low Paramount Pictures USA Tony Scott Action extremely_high Beverly Hills Cop II R 1987-05-20 extremely_short ... high high low low extremely_low extremely_low low high 6.311364 6.389337
139 extremely_low Orion Pictures USA Woody Allen Comedy extremely_low Radio Days PG 1987-01-30 extremely_short ... extremely_high extremely_high extremely_high extremely_high extremely_high extremely_high extremely_high extremely_high 7.775000 7.688861
151 extremely_low Cannon Group USA Sam Firstenberg Action extremely_low American Ninja 2: The Confrontation R 1987-05-01 extremely_short ... extremely_low extremely_low extremely_low extremely_low extremely_low extremely_low extremely_low extremely_low 5.712500 5.633537
213 extremely_low Paramount Pictures Canada Jonathan Kaplan Drama low Acusados R 1988-10-14 long ... extremely_low extremely_low high low extremely_low extremely_low high high 6.344444 6.386719

5 rows × 29 columns

In [85]:
#We are setting new categories for the day column by creating a new column for week 
# week_1 is the first 7 days of the month, week_2 is days 8 - 14, week_3 is days 15 - 21, and week_4 are the 
# rest of the days
categories = ["week_1", "week_2", "week_3", "week_4"]

movies_discretized["week"] = pd.cut(movies_discretized["day"], [0, 8, 15, 22, 32], labels = categories)
In [86]:
movies_discretized.week.dtype
Out[86]:
CategoricalDtype(categories=['week_1', 'week_2', 'week_3', 'week_4'], ordered=True)
In [87]:
#Looking at the relationship between genre and percent profit 
movies_discretized_genre_pp = groupby_2_count(movies_discretized, "genre", "percent_profit", "score")
In [88]:
movies_discretized_genre_pp.head()
Out[88]:
count genre percent_profit
0 150 Action negative
1 60 Action low
2 62 Action high
3 29 Action extremely_high
4 16 Adventure negative
In [89]:
#Now we are getting the sum of each genre category... We do not have a function for sum... we could go back and rework 
#our function.
movies_discretized_genre_pp.groupby("genre")["count"].sum()
Out[89]:
genre
Action       301
Adventure     54
Animation     33
Biography     38
Comedy       233
Crime         69
Drama        116
Fantasy        4
Horror        27
Mystery        7
Romance        1
Name: count, dtype: int64
In [90]:
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
In [91]:
'''We ultimately want a column that contains the total counts for each genre group. We are probably doing this in 
a roundabout way, but as I am extremely new to python this is the best way I can think of doing it. We are going to create 
a new column that replicates the genre column called genre_count and then we will use the replace function to 
replace the genre names with their total count '''
#First, replicating the income level column in a column named budget_category_count
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"] 
#Now replacing the income level with the total count for each income level 
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Action"], 301 )
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Adventure"], 54)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Animation"], 33)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Biography"], 38)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Comedy"], 233)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Crime"], 69)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Drama"], 116)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Fantasy"], 4)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Horror"], 27)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Mystery"], 7)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Romance"], 1)
In [92]:
movies_discretized_genre_pp.head()
Out[92]:
count genre percent_profit genre_count
0 150 Action negative 301
1 60 Action low 301
2 62 Action high 301
3 29 Action extremely_high 301
4 16 Adventure negative 54
In [93]:
movies_discretized_genre_pp["genre_count"] = pd.to_numeric(movies_discretized_genre_pp["genre_count"])
In [94]:
#Okay, we are one step closer... Now, we need to create a column that takes the counts/genre_counts * 100 
movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"] *100
In [95]:
movies_discretized_genre_pp.head()
Out[95]:
count genre percent_profit genre_count percent
0 150 Action negative 301 49.833887
1 60 Action low 301 19.933555
2 62 Action high 301 20.598007
3 29 Action extremely_high 301 9.634551
4 16 Adventure negative 54 29.629630
In [96]:
# Attempting to graph this data using a grouped bar chart: 
# formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"], 
# title = "you can set the title of your graph here")
graph = movies_discretized_genre_pp.pivot("genre", "percent_profit", 
                                                "percent").plot(kind="bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
                                                               title = "Percent of Percent Profit to Genre Category")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Budget Category 
plt.xlabel("Genre")
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a 
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.6), ncol = 4, title = "Percent Makeup of Genre Category")
Out[96]:
<matplotlib.legend.Legend at 0x27e0abb4c18>
In [97]:
##############################################################################################################
# We are going to implement machine learning to see if we can predict either tmdb score or percent profit
##############################################################################################################
#
##############################################################################################################
# We are going to create an all numeric df, that does not contain the profit, percent profit, votes, and gross. 
# This information would give us an unfair look when trying to predict either score or percent profit, as these 
# metrics would not be available prior to the release of the movie. 
################################################################################################################
# Potential Machine Learning algorithms we can use on our all numeric df: 
# logistic regression
# Naive Bayes 
# k-means 
# Decision Tree 
# knn 
# svm
################################################################################################################
#
#
################################################################################################################
# We are also going to create an all discretized df
################################################################################################################
################################################################################################################
# Potential Machine Learning algorithms we can use on our discretized df: 
# Decision Tree
# Association Rule Mining 
# Random Forest 
################################################################################################################
In [98]:
#################################################################
#Naive Bayes 
#**All Numerica Data *** 
#################################################################
In [99]:
#Creating testing and training datasets comprised of all numeric data for score
test_train_movies = movies.copy()
In [100]:
test_train_movies_score = test_train_movies.copy()
In [101]:
test_train_movies_score.score.describe()
Out[101]:
count    883.000000
mean       6.461608
std        0.946882
min        2.600000
25%        5.900000
50%        6.500000
75%        7.100000
max        9.000000
Name: score, dtype: float64
In [102]:
#We believe that people will be most interested in if they will get a high imdb score and not if they will have an 
#average or low score... Due to this, we have decided to discretize score in 2 categories: high and not_high. 
#We have decided that a high score is 7+. 
test_train_movies_score = test_train_movies_score.copy()
categories = ["not_high", "high"]
# not_high = 0 - 7 
# high = 7 - 10
test_train_movies_score["score"] = pd.cut(test_train_movies_score["score"], [0, 7, 10], labels = categories)
In [103]:
#Getting a count of the number of movies that are classified as high and not high in our df. 
test_train_movies_score_count = test_train_movies_score.groupby("score")["score"].count()
test_train_movies_score_count
Out[103]:
score
not_high    641
high        242
Name: score, dtype: int64
In [104]:
# We are going to create a testing and training df that contains 242 not_high entries and 242 high entries 
#First we are going to subset the not_high scores and the high scores 
not_high = test_train_movies_score[test_train_movies_score["score"] == "not_high"]
test_train_movies_score = test_train_movies_score[test_train_movies_score["score"] == "high"]
#Getting the length to make sure that we have 641 movies
len(not_high)
Out[104]:
641
In [105]:
#Checking to make sure that the test_train_movies_score is equal to the 242 high movies 
len(test_train_movies_score)
Out[105]:
242
In [106]:
#Now getting a random sample of 242 entries in the not_high df and setting the seed to 123 to reproduce the results 
not_high = not_high.sample(n = 242, random_state = 123)
#Getting the length to make sure that it worked 
len(not_high)
Out[106]:
242
In [107]:
#Adding the not_high movies back to the test_train_movies_score df 
test_train_movies_score = pd.concat([test_train_movies_score, not_high])
len(test_train_movies_score)
Out[107]:
484
In [108]:
#Changing the data type of month day and year to numeric
columns = ["month", "day"]
test_train_movies_score[columns] = test_train_movies_score[columns].apply(pd.to_numeric)
In [109]:
# We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential 
# imdb rating) and all non numeric data 

#We are creating a test_train_movies_score_exp df that includes the exponential moving averages. We are interested to 
#see if the cumulative or exponential moving averages will help our algorithm the most. 

#Using the exponential moving average to try to predict score first 
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
 'exp_mean_star',  'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_exp = test_train_movies_score[columns].copy()
In [110]:
#Creating a test_train_movies_score_cumulative df 

#We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential imdb rating) and all non numeric data 
#Getting a list of column names, so we can copy and paste it to select the columns we want. 
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
 'cumulative_mean_star',  'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_score_cumulative = test_train_movies_score[columns].copy()
In [111]:
#Creating a df that contains both the cumulative and exponential moving averages 
columns = ['budget',  'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
 'cumulative_mean_star',  'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
 'exp_mean_star',  'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_cumulative_exp = test_train_movies_score[columns].copy()
In [112]:
# #removing the label from the test_train_movies_score_exp df and saving it in a label df
test_train_movies_score_exp_label = test_train_movies_score_exp["score"]
test_train_movies_score_exp.drop("score", axis = 1, inplace = True)
#repeating the process for cumulative 
test_train_movies_score_cumulative_label = test_train_movies_score_cumulative["score"]
test_train_movies_score_cumulative.drop("score", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_score_cumulative_exp_label = test_train_movies_score_cumulative_exp["score"]
test_train_movies_score_cumulative_exp.drop("score", axis = 1, inplace = True)
In [113]:
#Creating 4 dfs: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
#For each test_train df
from sklearn.model_selection import train_test_split 
score_exp_train, score_exp_test, score_exp_train_label, score_exp_test_label = train_test_split(test_train_movies_score_exp, test_train_movies_score_exp_label, test_size = .3, random_state = 123)
score_cumulative_train, score_cumulative_test, score_cumulative_train_label, score_cumulative_test_label = train_test_split(test_train_movies_score_cumulative, test_train_movies_score_cumulative_label, test_size = .3, random_state = 123)
score_cumulative_exp_train, score_cumulative_exp_test, score_cumulative_exp_train_label, score_cumulative_exp_test_label = train_test_split(test_train_movies_score_cumulative_exp, test_train_movies_score_cumulative_exp_label, test_size = .3, random_state = 123)
In [114]:
#Getting a count of high scores in our test label and not_high scores in out test label
#We would prefer to have an equal or almost equal number of movies classified as high and not high in our test set 

Counter(score_exp_test_label)
Out[114]:
Counter({'not_high': 72, 'high': 74})
In [115]:
#Using the standard scale to help preprocess and normalize the data 
# performing preprocessing part 
sc = StandardScaler()
score_exp_train = sc.fit_transform(score_exp_train)
score_exp_test = sc.transform(score_exp_test)
score_cumulative_train = sc.fit_transform(score_cumulative_train)
score_cumulative_test = sc.transform(score_cumulative_test)
score_cumulative_exp_train = sc.fit_transform(score_cumulative_exp_train)
score_cumulative_exp_test = sc.transform(score_cumulative_exp_test)
In [116]:
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(score_exp_train, score_exp_train_label)
test_predicted_exp_nb = clf.predict(score_exp_test)

clf.fit(score_cumulative_train, score_cumulative_train_label)
test_predicted_cumulative_nb = clf.predict(score_cumulative_test)

clf.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
test_predicted_cumulative_exp_nb = clf.predict(score_cumulative_exp_test)
In [117]:
#Accuracy for exp 
exp_accuracy_nb = accuracy_score(score_exp_test_label, test_predicted_exp_nb, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_nb)
confusion_matrix_graph(cm, exp_accuracy_nb, "Exp")
Out[117]:
Text(0.5, 1, 'Exp Accuracy Score: 0.7123')
In [118]:
print(classification_report(score_exp_test_label, test_predicted_exp_nb))
              precision    recall  f1-score   support

        high       0.72      0.70      0.71        74
    not_high       0.70      0.72      0.71        72

    accuracy                           0.71       146
   macro avg       0.71      0.71      0.71       146
weighted avg       0.71      0.71      0.71       146

In [119]:
#Accuracy for cumulative 
cum_accuracy_nb = accuracy_score(score_cumulative_test_label, test_predicted_cumulative_nb, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_nb)
confusion_matrix_graph(cm, cum_accuracy_nb, "Cumulative")
Out[119]:
Text(0.5, 1, 'Cumulative Accuracy Score: 0.7466')
In [120]:
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_nb))
              precision    recall  f1-score   support

        high       0.74      0.77      0.75        74
    not_high       0.75      0.72      0.74        72

    accuracy                           0.75       146
   macro avg       0.75      0.75      0.75       146
weighted avg       0.75      0.75      0.75       146

In [121]:
#Accuracy for cumulative and exp
cum_exp_accuracy_nb = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb, 
                                     normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb)
confusion_matrix_graph(cm, cum_exp_accuracy_nb, "Cumulative & Exp")
Out[121]:
Text(0.5, 1, 'Cumulative & Exp Accuracy Score: 0.7123')
In [122]:
print(classification_report(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb))
              precision    recall  f1-score   support

        high       0.71      0.74      0.72        74
    not_high       0.72      0.68      0.70        72

    accuracy                           0.71       146
   macro avg       0.71      0.71      0.71       146
weighted avg       0.71      0.71      0.71       146

In [123]:
#The Naive Bayes with the cumulative mean, proved better than the other options... While, it did correctly classify 
# less high scores than the other two models. It had a much better accuracy classifying low scores as well as high 
#scores. It correctly classifying 57 out of 74 high scores and 52 out of 72 not high scores... The other models 
#suffered when classifying not high scores... 
In [124]:
###########################################################################################
# PCA and Logistic Regression 
###########################################################################################
#Running PCA (Principal Component Analysis) to see which variables are the most beneficial for our prediction 
# pca = PCA(n_components = 2)
# score_exp_train_pca = pca.fit_transform(score_exp_train)
# score_exp_test_pca = pca.transform(score_exp_test)
In [125]:
# explained_variance = pca.explained_variance_ratio_
# explained_variance
In [126]:
#Fitting Logistic Regression 
classifier = LogisticRegression(random_state = 723)

classifier.fit(score_exp_train, score_exp_train_label)

#Predicting the test labels 
test_predicted_exp_lg = classifier.predict(score_exp_test)

#getting the accuracy 
exp_accuracy_lg = accuracy_score(score_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
Out[126]:
Text(0.5, 1, 'LG Exp Accuracy Score: 0.726')
In [127]:
print(classification_report(score_exp_test_label, test_predicted_exp_lg))
              precision    recall  f1-score   support

        high       0.72      0.76      0.74        74
    not_high       0.74      0.69      0.71        72

    accuracy                           0.73       146
   macro avg       0.73      0.73      0.73       146
weighted avg       0.73      0.73      0.73       146

In [128]:
#For the cumulative avg 
classifier.fit(score_cumulative_train, score_cumulative_train_label)

#Predicting the test labels 
test_predicted_cumulative_lg = classifier.predict(score_cumulative_test)

#getting the accuracy 
cum_accuracy_lg =accuracy_score(score_cumulative_test_label, test_predicted_cumulative_lg, normalize = True)

#Looking at the confusion matrix 
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
Out[128]:
Text(0.5, 1, 'LG Cumulative Accuracy Score: 0.726')
In [129]:
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_lg))
              precision    recall  f1-score   support

        high       0.71      0.77      0.74        74
    not_high       0.74      0.68      0.71        72

    accuracy                           0.73       146
   macro avg       0.73      0.73      0.73       146
weighted avg       0.73      0.73      0.73       146

In [130]:
#For the cumulative avg 
classifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)

#Predicting the test labels 
test_predicted_cumulative_exp_lg = classifier.predict(score_cumulative_exp_test)

#getting the accuracy 
cum_exp_accuracy_lg = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg, normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Cumulative & Exp")
Out[130]:
Text(0.5, 1, 'LG Cumulative & Exp Accuracy Score: 0.7192')
In [131]:
print(classification_report(score_cumulative_exp_test_label,test_predicted_cumulative_exp_lg))
              precision    recall  f1-score   support

        high       0.71      0.76      0.73        74
    not_high       0.73      0.68      0.71        72

    accuracy                           0.72       146
   macro avg       0.72      0.72      0.72       146
weighted avg       0.72      0.72      0.72       146

In [132]:
#Out of the 3 different testing and training dfs the df with the exponential moving averages proved best for our 
# logistic regression models. We had an overall accuracy of 72.6% and accurately classified high scores 74.4% and 
# not_high scores 64.1% of the time... 
In [133]:
###########################################################################################
# SVMS
###########################################################################################
In [134]:
#Cumulative SVM - Fitting the classifier 
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_train, score_cumulative_train_label)
#Making the predictions 
test_predicted_cum_svm = svclassifier.predict(score_cumulative_test)
#Creating my confusion matrix 
cum_accuracy_svm = accuracy_score(score_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
Out[134]:
Text(0.5, 1, 'SVM Cumulative Accuracy Score: 0.7466')
In [135]:
print(classification_report(score_cumulative_test_label, test_predicted_cum_svm))
              precision    recall  f1-score   support

        high       0.73      0.80      0.76        74
    not_high       0.77      0.69      0.73        72

    accuracy                           0.75       146
   macro avg       0.75      0.75      0.75       146
weighted avg       0.75      0.75      0.75       146

In [136]:
#Exp SVM - Fitting the classifier 
svclassifier = SVC(kernel='linear')
svclassifier.fit(score_exp_train, score_exp_train_label)
#Making the predictions 
test_predicted_exp_svm = svclassifier.predict(score_exp_test)
#Creating my confusion matrix 
exp_accuracy_svm = accuracy_score(score_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
Out[136]:
Text(0.5, 1, 'SVM Exp Accuracy Score: 0.726')
In [137]:
print(classification_report(score_exp_test_label, test_predicted_exp_svm))
              precision    recall  f1-score   support

        high       0.73      0.73      0.73        74
    not_high       0.72      0.72      0.72        72

    accuracy                           0.73       146
   macro avg       0.73      0.73      0.73       146
weighted avg       0.73      0.73      0.73       146

In [138]:
#Exp & Cum SVM - Fitting the classifier 
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
#Making the predictions 
test_predicted_cum_exp_svm = svclassifier.predict(score_cumulative_exp_test)
#Creating my confusion matrix 
cum_exp_accuracy_svm = accuracy_score(score_cumulative_exp_test_label, test_predicted_cum_exp_svm, 
                                      normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
Out[138]:
Text(0.5, 1, 'SVM Exp & Cumulative Accuracy Score: 0.7192')
In [139]:
print(classification_report(score_cumulative_exp_test_label, test_predicted_cum_exp_svm))
              precision    recall  f1-score   support

        high       0.71      0.76      0.73        74
    not_high       0.73      0.68      0.71        72

    accuracy                           0.72       146
   macro avg       0.72      0.72      0.72       146
weighted avg       0.72      0.72      0.72       146

In [140]:
################################################################################################
#
# Now looking to see if we can predict percent profit
#
################################################################################################

#We will be using the same columns, but instead of having score, we will have percent_profit. 
In [141]:
#We are interested in if a movie will make a profit or not. Therefore, we are only discretizing into 2 categories: 
#negative and postive. 
test_train_movies_pp = test_train_movies_score.copy()
categories = ["negative", "positive"]
# Negative anything less than 0 
#positive anything greater than 0 
test_train_movies_pp["percent_profit"] = pd.cut(test_train_movies_pp["percent_profit"], [-101, 0, 999999], labels = categories)
In [142]:
#Getting the count of each category in our df
test_train_movies_pp_count = test_train_movies_pp.groupby("percent_profit")["percent_profit"].count()
test_train_movies_pp_count
Out[142]:
percent_profit
negative    198
positive    286
Name: percent_profit, dtype: int64
In [143]:
# We are going to create a testing and training df that contains 198 negative, 198 positive percent_profits  
#First we are going to subset the positive percent profits and the negative per+cent_profits 
positive = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "positive"]
test_train_movies_pp = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "negative"]
#Getting the length to make sure that we have 198 negative, 286 postive in our df 
print(len(positive))
print(len(test_train_movies_pp))
286
198
In [144]:
#Now getting a random sample of 198 entries in the positive df and setting the seed to 123
#to reproduce the results 
positive = positive.sample(n = 198, random_state = 123)
#Getting the length to make sure that it worked 
print(len(positive))
198
In [145]:
#Adding the positive movies back to the test_train_movies_pp df 
test_train_movies_pp = pd.concat([test_train_movies_pp, positive])
#Getting the length to make sure that the 2 df were combined correctly and if it did we would have 396 movies in our df
len(test_train_movies_pp)
Out[145]:
396
In [146]:
#Changing the data type of month day and year to numeric
columns = ["month", "day"]
test_train_movies_pp[columns] = test_train_movies_pp[columns].apply(pd.to_numeric)
In [147]:
'''We need to remove gross, votes, profit, and score (these columns give an unfair look at the potential 
imdb rating) and all non numeric data 
'''
#Using the exponential moving average to try to predict score first 
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
 'exp_mean_star',  'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_exp = test_train_movies_pp[columns].copy()
In [148]:
#We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential imdb rating) and all non numeric data 
#Getting a list of column names, so we can copy and paste it to select the columns we want. 
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
 'cumulative_mean_star',  'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_pp_cumulative = test_train_movies_pp[columns].copy()
In [149]:
 columns = ['percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
 'cumulative_mean_star',  'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
 'exp_mean_star',  'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_cumulative_exp = test_train_movies_pp[columns].copy()
In [150]:
# #removing the label from the test_train_movies_spp df and saving it in a label df
test_train_movies_pp_exp_label = test_train_movies_pp_exp["percent_profit"]
test_train_movies_pp_exp.drop("percent_profit", axis = 1, inplace = True)
#repeating the process for cumulative 
test_train_movies_pp_cumulative_label = test_train_movies_pp_cumulative["percent_profit"]
test_train_movies_pp_cumulative.drop("percent_profit", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_pp_cumulative_exp_label = test_train_movies_pp_cumulative_exp["percent_profit"]
test_train_movies_pp_cumulative_exp.drop("percent_profit", axis = 1, inplace = True)
In [151]:
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
from sklearn.model_selection import train_test_split 
pp_exp_train, pp_exp_test, pp_exp_train_label, pp_exp_test_label = train_test_split(test_train_movies_pp_exp, test_train_movies_pp_exp_label, test_size = .3, random_state = 123)
pp_cumulative_train, pp_cumulative_test, pp_cumulative_train_label, pp_cumulative_test_label = train_test_split(test_train_movies_pp_cumulative, test_train_movies_pp_cumulative_label, test_size = .3, random_state = 123)
pp_cumulative_exp_train, pp_cumulative_exp_test, pp_cumulative_exp_train_label, pp_cumulative_exp_test_label = train_test_split(test_train_movies_pp_cumulative_exp, test_train_movies_pp_cumulative_exp_label, test_size = .3, random_state = 123)
In [152]:
#Getting a count of percent_profit in our test label  scores in out test label
#We want to have roughly the same number of positive and negative movies in our test df
Counter(pp_exp_test_label)
Out[152]:
Counter({'negative': 61, 'positive': 58})
In [153]:
#Using the standard scale to help preprocess and normalize the data 
# performing preprocessing part 
sc = StandardScaler()
pp_exp_train = sc.fit_transform(pp_exp_train)
pp_exp_test = sc.transform(pp_exp_test)
pp_cumulative_train = sc.fit_transform(pp_cumulative_train)
pp_cumulative_test = sc.transform(pp_cumulative_test)
pp_cumulative_exp_train = sc.fit_transform(pp_cumulative_exp_train)
pp_cumulative_exp_test = sc.transform(pp_cumulative_exp_test)
In [154]:
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(pp_exp_train, pp_exp_train_label)
test_predicted_exp_svm = clf.predict(pp_exp_test)

clf.fit(pp_cumulative_train, pp_cumulative_train_label)
test_predicted_cumulative_svm = clf.predict(pp_cumulative_test)

clf.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
test_predicted_cumulative_exp_svm = clf.predict(pp_cumulative_exp_test)
In [155]:
#Accuracy for exp 
exp_accuracy_svm = accuracy_score(pp_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
Out[155]:
Text(0.5, 1, 'SVM Exp Accuracy Score: 0.5294')
In [156]:
print(classification_report(pp_exp_test_label, test_predicted_exp_svm))
              precision    recall  f1-score   support

    negative       0.54      0.56      0.55        61
    positive       0.52      0.50      0.51        58

    accuracy                           0.53       119
   macro avg       0.53      0.53      0.53       119
weighted avg       0.53      0.53      0.53       119

In [157]:
#Accuracy for cumulative
cum_accuracy_svm = accuracy_score(pp_cumulative_test_label, test_predicted_cumulative_svm, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cumulative_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
Out[157]:
Text(0.5, 1, 'SVM Cumulative Accuracy Score: 0.521')
In [158]:
#Accuracy for cumulative and exp 
exp_cum_accuracy_svm = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_svm, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_svm)
confusion_matrix_graph(cm, exp_cum_accuracy_svm, "SVM Exp & Cumulative")
Out[158]:
Text(0.5, 1, 'SVM Exp & Cumulative Accuracy Score: 0.4874')
In [159]:
###########################################################################################
# PCA and Logistic Regression 
###########################################################################################
In [160]:
#Fitting Logistic Regression 
classifier = LogisticRegression(random_state = 723)

classifier.fit(pp_exp_train, pp_exp_train_label)

#Predicting the test labels 
test_predicted_exp_lg = classifier.predict(pp_exp_test)

#getting the accuracy 
exp_accuracy_lg = accuracy_score(pp_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
Out[160]:
Text(0.5, 1, 'LG Exp Accuracy Score: 0.5126')
In [161]:
print(classification_report(pp_exp_test_label, test_predicted_exp_lg))
              precision    recall  f1-score   support

    negative       0.53      0.51      0.52        61
    positive       0.50      0.52      0.51        58

    accuracy                           0.51       119
   macro avg       0.51      0.51      0.51       119
weighted avg       0.51      0.51      0.51       119

In [162]:
#Fitting Logistic Regression 
classifier = LogisticRegression(random_state = 723)

classifier.fit(pp_cumulative_train, pp_cumulative_train_label)

#Predicting the test labels 
test_predicted_cum_lg = classifier.predict(pp_exp_test)

#getting the accuracy 
cum_accuracy_lg = accuracy_score(pp_cumulative_test_label, test_predicted_cum_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
Out[162]:
Text(0.5, 1, 'LG Cumulative Accuracy Score: 0.5378')
In [163]:
print(classification_report(pp_cumulative_test_label, test_predicted_cum_lg))
              precision    recall  f1-score   support

    negative       0.55      0.51      0.53        61
    positive       0.52      0.57      0.55        58

    accuracy                           0.54       119
   macro avg       0.54      0.54      0.54       119
weighted avg       0.54      0.54      0.54       119

In [164]:
#Fitting Logistic Regression 
classifier = LogisticRegression(random_state = 723)

classifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)

#Predicting the test labels 
test_predicted_cum_exp_lg = classifier.predict(pp_cumulative_exp_test)

#getting the accuracy 
cum_exp_accuracy_lg = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Exp & Cumulative")
Out[164]:
Text(0.5, 1, 'LG Exp & Cumulative Accuracy Score: 0.4454')
In [165]:
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg))
              precision    recall  f1-score   support

    negative       0.45      0.38      0.41        61
    positive       0.44      0.52      0.48        58

    accuracy                           0.45       119
   macro avg       0.45      0.45      0.44       119
weighted avg       0.45      0.45      0.44       119

In [166]:
###########################################################################################
# SVMS
###########################################################################################
In [167]:
#Cumulative SVM - Fitting the classifier 
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_train, pp_cumulative_train_label)
#Making the predictions 
test_predicted_cum_svm = svclassifier.predict(pp_cumulative_test)
#Creating my confusion matrix 
cum_accuracy_svm = accuracy_score(pp_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
Out[167]:
Text(0.5, 1, 'SVM Cumulative Accuracy Score: 0.5042')
In [168]:
print(classification_report(pp_cumulative_test_label, test_predicted_cum_svm))
              precision    recall  f1-score   support

    negative       0.52      0.41      0.46        61
    positive       0.49      0.60      0.54        58

    accuracy                           0.50       119
   macro avg       0.51      0.51      0.50       119
weighted avg       0.51      0.50      0.50       119

In [169]:
#Exp SVM - Fitting the classifier 
svclassifier = SVC(kernel='linear')
svclassifier.fit(pp_exp_train, pp_exp_train_label)
#Making the predictions 
test_predicted_exp_svm = svclassifier.predict(pp_exp_test)
#Creating my confusion matrix 
exp_accuracy_svm = accuracy_score(pp_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
Out[169]:
Text(0.5, 1, 'SVM Exp Accuracy Score: 0.5714')
In [170]:
print(classification_report(pp_exp_test_label, test_predicted_exp_svm))
              precision    recall  f1-score   support

    negative       0.57      0.64      0.60        61
    positive       0.57      0.50      0.53        58

    accuracy                           0.57       119
   macro avg       0.57      0.57      0.57       119
weighted avg       0.57      0.57      0.57       119

In [182]:
#Exp & Cum SVM - Fitting the classifier 
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
#Making the predictions 
test_predicted_cum_exp_svm = svclassifier.predict(pp_cumulative_exp_test)
#Creating my confusion matrix 
cum_exp_accuracy_svm = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm, 
                                      normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
Out[182]:
Text(0.5, 1, 'SVM Exp & Cumulative Accuracy Score: 0.479')
In [172]:
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm))
              precision    recall  f1-score   support

    negative       0.43      0.38      0.40        61
    positive       0.42      0.47      0.44        58

    accuracy                           0.42       119
   macro avg       0.42      0.42      0.42       119
weighted avg       0.42      0.42      0.42       119

In [173]:
# We had much more success when predicting score versus percent profit... However, the SVM with Linear Kernel on 
#the exponential df did have a 57% accuracy.
#We believe that in order to have a better accuracy when predicting percent profit, we need to have more data. Our 
#next step is to find more data and then use the information gained from this analysis on our new data.