#Ali and Kendra Final Project
#Importing pandas, csv, and the other libraries we need
import csv
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import statistics
#To create testing and training dfs and labels
from sklearn.model_selection import train_test_split
#To get a count or tally of a category in our df
from collections import Counter
# To model the Gaussian Naive Bayes classifier
from sklearn.naive_bayes import GaussianNB
# To calculate the accuracy score of the model
from sklearn.metrics import accuracy_score
#confusion matrix
from sklearn.metrics import confusion_matrix, classification_report
#for pre-processing to fit all numeric data on the standard scale
from sklearn.preprocessing import StandardScaler
#for applying PCA function on training and testing sets
from sklearn.decomposition import PCA
#logistic regression
from sklearn.linear_model import LogisticRegression
#SVMs
from sklearn.svm import SVC
#For association rule mining
from apyori import apriori
#This will allow us to silence the warnings
import warnings
warnings.simplefilter("ignore")
#For the confusion matrix
import seaborn as sns
#Functions that we are going to use in our file:
#Creating a function that will discretize our columns based on quartiles
def quartile_discretize(df, column, categories):
    df[column] = pd.qcut(df[column], 4, labels = categories)
    return(df[column])
#Creating a function that will merge our dfs with a left join
def left_merge_2_conditions(df1, df2, column1, column2):
    df = pd.merge(df1, df2, how = "left", on = [column1, column2])
    return(df)
#Creating a function that groups by, counts, creates a new column from the index, drops the index and changes the column names
def groupby_count(df, groupby_column, count_column):
    new_df = pd.DataFrame(df.groupby(groupby_column)[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column] = new_df.index.get_level_values(0)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)
#Creating a function that groups by two columns, counts, creates new columns from the index, drops the index and changes the column names
def groupby_2_count(df, groupby_column1, groupby_column2, count_column):
    new_df = pd.DataFrame(df.groupby([groupby_column1, groupby_column2])[count_column].count())
    new_df.columns = ["count"]
    new_df[groupby_column1] = new_df.index.get_level_values(0)
    new_df[groupby_column2] = new_df.index.get_level_values(1)
    new_df.reset_index(drop = True, inplace = True)
    return(new_df)
#This will calculate the exponential moving average of the columns we want
#exponential moving averages give more weight to the most recent data and less weight to older data
def exp_moving_avg(d):
    d["exp_moving_avg"] = d["score"].ewm(span = 40, adjust = False).mean()
    exp_moving_avg = list(d["exp_moving_avg"])
    #Adding a 0 as the first entry of exp_moving_avg
    exp_moving_avg = [0] + exp_moving_avg
    #Removing the last entry in the list
    exp_moving_avg.pop()
    #Creating a column named exp_moving_avg with the results
    d["exp_moving_avg"] = exp_moving_avg
    return(exp_moving_avg)
#This will calculate the cumulative moving average
def cumulative_moving_avg(d):
    d["moving_avg"] = d["score"].expanding(min_periods = 1).mean()
    moving_avg = list(d["moving_avg"])
    #Adding a 0 as the first entry of the moving average list
    cumulative_moving_avg = [0] + moving_avg
    #Removing the last entry in the list
    cumulative_moving_avg.pop()
    return(cumulative_moving_avg)
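#Quick sanity check of the two moving-average helpers above (a toy example, not part of the movie pipeline):
#both lists are shifted down one row, so each entry only reflects the scores that came before it.
toy = pd.DataFrame({"score": [6.0, 7.0, 8.0, 9.0]})
print(cumulative_moving_avg(toy.copy()))   #expected: [0, 6.0, 6.5, 7.0]
print(exp_moving_avg(toy.copy()))          #first entry is 0; each later entry is the EMA of all earlier scores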
#This will get the list of all of the entries in the column that we are interested in for calculating the averages
def getting_list_of_entries(df, column_interested_in):
    avg_people = pd.DataFrame(df.groupby([column_interested_in, "released"])["score"].mean())
    column_interested = list(df[column_interested_in].unique())
    return([avg_people, column_interested])
# Going to use matplotlib for plotting...
# To create a plot we follow this formula:
# df.plot(x-axis, y-axis, kind = type of plot, color = [colors to use], legend = False (we did not want a legend
# displayed), title = "Title"). Then we add a y label with plt.ylabel("type label here") and an x label with
# plt.xlabel("type label here"). Finally, we change the direction of the xtick names from a 90 degree angle
# to no angle with plt.xticks(rotation = rotation angle desired)
def bar_graph_count(df, x_column, y_column, title):
    g = df.plot(x_column, y_column, kind = "bar", legend = False, title = title)
    g = plt.ylabel(y_column)
    g = plt.xlabel(x_column)
    return(g)
#This will make a df for our moving averages that we are calculating
def making_df(people_df, column_interested_in, released, person, cumulative_avg, exp_avg):
    df_2 = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean": cumulative_avg, "exp_mean": exp_avg})
    return(df_2)
#This uses the functions above to calculate the exponential and cumulative moving averages for whichever
#column we specify, and returns a df with the column of interest, released, cumulative_mean, exp_mean
def calculating_moving_avg(df, column_interested_in):
    people_df = pd.DataFrame()
    people = getting_list_of_entries(df, column_interested_in)
    avg_people = people[0]
    for person in people[1]:
        d = avg_people.groupby(column_interested_in).get_group(person)
        cumulative_avg = cumulative_moving_avg(d)
        exp_avg = exp_moving_avg(d)
        d.reset_index(inplace = True)
        released = d["released"]
        df = pd.DataFrame({column_interested_in: person, "released": released, "cumulative_mean_" + column_interested_in: cumulative_avg, "exp_mean_" + column_interested_in: exp_avg})
        people_df = pd.concat([people_df, df])
    return(people_df)
#Confusion Matrix Graph Function
def confusion_matrix_graph(cm, accuracy_label, type_of_df):
    g = plt.figure(figsize = (2, 2))
    g = sns.heatmap(cm, annot = True, fmt = ".3f", linewidths = .5, square = True, cmap = 'Blues_r', cbar = False)
    g = plt.ylabel('Actual')
    g = plt.xlabel('Predicted')
    all_sample_title = type_of_df + ' Accuracy Score: {0}'.format(round(accuracy_label, 4))
    g = plt.title(all_sample_title, size = 12)
    return(g)
#reading in the movies.csv file from Kaggle
movies = pd.read_csv("movies.csv", encoding = "ISO-8859-1")
len(movies)
#Looking at the head of the dataframe
movies.head()
#Getting the shape of the df
movies.shape
#We currently have 6,820 rows and 15 columns
#Checking to see if we have any missing values... It shows that we do not.
movies.isnull().sum()
# If we had missing values we would do the following
#We are dropping those rows with the following code
#movies.dropna(inplace = True)
#We are removing any movie that has a budget of 0, because for our machine learning, we will want to predict profit.
# movies in our df that do not contain gross or budget will not be useful for this.
movies = movies[movies["budget"] != 0]
#We are removing any movie with a gross of 0
movies = movies[movies["gross"] != 0]
# movies = movies[movies["production_companies"] != "[]"]
# movies = movies[movies["genres"] != "[]"]
len(movies)
#Checking data types of the columns
movies.dtypes
#Once we are done cleaning the data we are going to change the data types of: company, director, genre, rating, released,
#star, writer, and potentially year. If we change them now, when we clean the df and remove rows, the old categories
#remain and still show as possible categories.
#Need to change the following to date
#released,year
movies["released"] = pd.to_datetime(movies["released"])
#Separating the month and day into their own columns in case we would like to analyze based on month or day
movies["month"], movies["day"] = movies["released"].dt.month, movies["released"].dt.day
#Checking the data types of the columns and making sure the new columns were added
movies.dtypes
cat = list(range(1,13))
#Changing the month data type from int to ordered category
movies["month"] = pd.Categorical(movies["month"], ordered = True, categories = cat)
#Making sure it shows as an ordered factor
movies.month.dtype
movies.dtypes
#Getting a list of the different ratings in our df
movies["rating"].unique()
#UNRATED, NOT RATED, and Not specified all mean the same thing, therefore we are changing all NOT RATED and Not specified entries to UNRATED
movies["rating"] = movies["rating"].replace(["NOT RATED", "Not specified"], "UNRATED")
#Checking to make sure that worked:
movies["rating"].unique()
#Changing rating to an ordered factor
#Creating the order that we would like for the ordered factor
cat = ["UNRATED", "G", "PG", "PG-13", "R", "NC-17"]
#Changing to ordered factor
movies["rating"] = pd.Categorical(movies["rating"], ordered = True, categories = cat)
#Checking to see if it worked
movies.rating.dtype
#We want to be able to look at the profit for each movie... Therefore we are creating a
#profit column which is gross - budget
movies["profit"] = movies["gross"] - movies["budget"]
#Creating a percent profit column to have a normalized way to compare profits.
#percent_profit = profit/budget*100
movies["percent_profit"] = movies["profit"]/movies["budget"]*100
movies.head()
#Directors
#Calculating the cumulative and exponential moving averages of the imdb score for each director, using only the
#movies released prior to the movie we are calculating the average for.
directors_avg = calculating_moving_avg(movies, "director")
#Writers:
writers_avg = calculating_moving_avg(movies, "writer")
#actors:
stars_avg = calculating_moving_avg(movies, "star")
#company:
companies_avg = calculating_moving_avg(movies, "company")
#We are going to use our left_merge_2_conditions function:
#Inputs: df1, df2, column to merge on 1 and column to merge on 2
movies = left_merge_2_conditions(movies, directors_avg, "director", "released")
movies = left_merge_2_conditions(movies, writers_avg, "writer", "released")
movies = left_merge_2_conditions(movies, stars_avg, "star", "released")
movies = left_merge_2_conditions(movies, companies_avg, "company", "released")
movies.head()
#Looking to see what happens if we remove all the movies with a 0 for any of the exp_mean columns
movies = movies[movies["exp_mean_director"] != 0]
movies = movies[movies["exp_mean_star"] != 0]
movies = movies[movies["exp_mean_writer"] != 0]
movies = movies[movies["exp_mean_company"] != 0]
len(movies) #We still have 883 movies in our df
movies.head()
#Creating an aggregated column for the avg writer, director, company, actor cumulative mean
movies["cumulative_mean_avg"] = (movies["cumulative_mean_writer"] + movies["cumulative_mean_director"] +
movies["cumulative_mean_company"] + movies["cumulative_mean_star"])/4
# Creating an aggregated column for the avg writer, director, company,
# and actor exponential mean
movies["exp_mean_avg"] = (movies["exp_mean_writer"] + movies["exp_mean_director"] +
movies["exp_mean_company"] + movies["exp_mean_star"])/4
movies.head()
#What is the breakdown of genre in our df?
#Getting the count of movies for each genre in our df and saving it as a pandas df.
#We are grouping by genre and then getting the count of the genre column in each group by
#we could have used any column to get the count of...
#We are using the groupby_count function that takes the following arguments (df, groupby_column, count_column)
movies_genre = groupby_count(movies, "genre", "genre")
movies_genre
#Sorting the df, so the bar graph will be in descending order
movies_genre.sort_values(['count'], ascending=[False], inplace = True)
#Creating a graph of the movies_genre df using our bar_graph_count function. It takes the following inputs:
# df, x-column, y_column, and title
bar_graph_count(movies_genre, "genre", "count", "Visualization of Number of Movies Per Genre")
#Creating a data frame of the movies star count
movies_star = groupby_count(movies, "star", "genre")
movies_star.head()
# #Creating a subset of the movies_star df that contains only stars that have 2+ movies in our df
# movies_star = movies_star[movies_star["count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_star_list = list(movies_star["star"])
# movies_star_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.star.isin(movies_star_list)]
movies_star.describe()
#The majority of our 356 stars appear in only one or two movies; only 25% of our stars appear in 3 or more movies.
# We have 649 stars in our newly reduced df
# How many movies have each star in our df starred in?
#Looking to see how many stars have starred in 1, 2, 3, 4+ movies
movies_star = groupby_count(movies, "star", "genre")
#Changing the column names for our movies_star
movies_star.columns = ["number_of_movies", "star"]
movies_star = groupby_count(movies_star, "number_of_movies", "star")
movies_star
#Changing the column names
movies_star.columns = ["number_of_stars", "number_of_movies"]
#Sorting the df, so the bar graph will be in descending order
movies_star.sort_values(['number_of_movies'], ascending=[True], inplace = True)
#Creating a graph of the movies_star df using our bar_graph_count function. It takes the following inputs:
# df, x-column, y_column, and title
bar_graph_count(movies_star, "number_of_movies", "number_of_stars", "Visualization of the Number of Movies each Star has in our DF")
#Creating a data frame of the movies director count
movies_director = groupby_count(movies, "director", "genre")
movies_director.columns = ["director_count", "director"]
# #Creating a subset of the movies_director df that contains only directors that have 2 movies in our df
# movies_director = movies_director[movies_director["director_count"] > 1]
# #Creating a list of the stars that are in our subsetted df
# movies_director_list = list(movies_director["director"])
# movies_director_list
# #Subsetting our movies df to include only stars who are listed in our movies_star_list
# movies = movies[movies.director.isin(movies_director_list)]
# How many movies have each director in our df produced?
#Creating a new groupby to see how many directors we have that have produced 1, 2, 3, 4... movies
movies_director = groupby_count(movies_director, "director_count", "director")
#Getting a tally of the number of directors that have the same number of movies
movies_director.columns = ["number_of_movies", "director_count"]
movies_director
#Creating a graph of the movies_director df
bar_graph_count(movies_director, "director_count", "number_of_movies", "Visualization of the Number of Movies each Director has in our DF")
# How many movies have each company in our df produced?
#Creating a new groupby to see how many company we have that have produced 1, 2, 3, 4... movies
movies_company = groupby_count(movies, "company", "star")
movies_company.columns = ["number_of_movies", "company"]
movies_company = groupby_count(movies_company, "number_of_movies", "company")
movies_company.columns = ["number_of_companies", "number_of_movies"]
movies_company
#Creating a graph of the movies_company df
bar_graph_count(movies_company, "number_of_movies", "number_of_companies", "Visualization of the Number of Movies each Company has in our DF")
#How are the movies in our df distributed by year?
#Looking to see how the movies are distributed by year
movies_year = groupby_count(movies, "year", "star")
movies_year
#Creating a graph of the movies_year df
bar_graph_count(movies_year, "year", "count", "Visualization of Number of Movies Per Year")
# What is the breakdown of the number of movies per genre per year?
#Looking at the number of movies per genre per year
#Looking to see how the movies are distributed by year
#We are using the groupby_2_count function that takes the following arguments:
#(df, groupby_column1, groupby_column2, agg_column)
movies_year_genre = groupby_2_count(movies, "year", "genre", "star")
movies_year_genre.head()
#To visualize this, we are going to use a grouped bar graph. Our bar_graph_count function does not apply to this
#situation... We are going to use the pivot function
# Attempting to graph this data using a grouped bar chart:
# formula: df.pivot(index = ..., columns = ..., values = ...).plot(kind = "type of graph", color = [list of colors],
# title = "you can set the title of your graph here")
movies_year_genre.pivot(index = "year", columns = "genre", values = "count").plot(kind = "bar", title = "Visualization of Genre Breakdown by Year")
#Creating a y axis label
plt.ylabel("Number of Movies")
#Changing the x axis label
plt.xlabel("Year")
#Moving the legend below the graph, splitting it into 4 columns, and adding the title "Genre"
plt.legend(loc = "lower center", bbox_to_anchor = (.5, -.8), ncol = 4, title = "Genre")
#This was not very helpful; if time permits we will subset the data into 5-year chunks and regraph (see the sketch below)
'''Come back here if time prevails to do what is said above'''
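#If we come back to this, below is a rough sketch of the 5-year-chunk idea (assuming the movies df above;
#the bin edges are placeholders and would need to match the actual year range in the data):
#movies["year_chunk"] = pd.cut(movies["year"], bins = range(1985, 2021, 5), right = False)
#movies_5yr_genre = groupby_2_count(movies, "year_chunk", "genre", "star")
#movies_5yr_genre.pivot(index = "year_chunk", columns = "genre", values = "count").plot(kind = "bar",
#    title = "Visualization of Genre Breakdown by 5-Year Chunk")
#plt.ylabel("Number of Movies")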
#What are the most prolific months for movies to be released in our dataset?
#Looking at month
movies_month = groupby_count(movies, "month", "star")
movies_month
#Creating a new column called month with the names of the month of the year, since the df is ordered, we know that
#the months of the year can be added this way
movies_month["month"] = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
#Changing the data type of the month column to ordered categorical
cat = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sept", "Oct", "Nov", "Dec"]
movies_month["month"] = pd.Categorical(movies_month["month"], ordered = True, categories = cat)
movies_month.month.dtype
movies_month
bar_graph_count(movies_month, "month", "count", "Visualization of Number of Movies by Month")
#Discretizing the df based on all of the numeric columns:
#Creating a new df named movies_discretized
movies_discretized = movies.copy()
movies_discretized.columns
# We are going to discretize our data based on the quartiles. The categories are:
# extremely_low, low, high, extremely_high
# We are using our quartile_discretize function that takes the following arguments:
#(df, column, category)
categories = ["extremely_low", "low", "high", "extremely_high"]
movies_discretized["budget"] = quartile_discretize(movies_discretized, "budget", categories)
#Checking to make sure it worked
movies_discretized.budget.dtype
#Gross: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["gross"] = quartile_discretize(movies_discretized, "gross", categories)
#Checking to see that it worked
movies_discretized.gross.dtype
#Score: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["score"] = quartile_discretize(movies_discretized, "score", categories)
#Checking to see that it worked
movies_discretized.score.dtype
#Votes: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["votes"] = quartile_discretize(movies_discretized, "votes", categories)
movies_discretized.votes.dtype
#writer_mean: We are using the same categories and discretizing based on quantiles 25%, 50%, 75% 100%
movies_discretized["cumulative_mean_writer"] = quartile_discretize(movies_discretized,"cumulative_mean_writer", categories)
movies_discretized["exp_mean_writer"] = quartile_discretize(movies_discretized,"exp_mean_writer", categories)
movies_discretized["cumulative_mean_director"] = quartile_discretize(movies_discretized, "cumulative_mean_director", categories)
movies_discretized["exp_mean_director"] = quartile_discretize(movies_discretized,"exp_mean_director", categories)
movies_discretized["cumulative_mean_star"] = quartile_discretize(movies_discretized, "cumulative_mean_star", categories)
movies_discretized["exp_mean_star"] = quartile_discretize(movies_discretized, "exp_mean_star", categories)
movies_discretized["cumulative_mean_company"] = quartile_discretize(movies_discretized, "cumulative_mean_company", categories)
movies_discretized["exp_mean_company"] = quartile_discretize(movies_discretized, "exp_mean_company", categories)
#We are creating new label categories
categories = ["extremely_short", "short", "long", "extremely_long"]
movies_discretized["runtime"] = quartile_discretize(movies_discretized, "runtime", categories)
#Checking to see if that worked
movies_discretized.runtime.dtype
movies_discretized.percent_profit.describe()
#We are creating new label categories ; Discretized Percent Profit
#We cannot use our function on this, because we are not discretizing by quartiles
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized["percent_profit"] = pd.cut(movies_discretized["percent_profit"], [-100, 0, 50, 150, 999999], labels = categories)
#Checking to see if it worked
movies_discretized.percent_profit.dtype
movies_discretized.profit.describe()
#Profit
#We cannot use our function on this, because we are not discretizing by quartiles
movies_discretized["profit"] = pd.cut(movies_discretized["profit"], [-9999999999, 0, 1000000, 50000000, 999999999], labels = categories)
movies_discretized.head()
#We are setting new categories for the day column by creating a new column for week
# week_1 is the first 7 days of the month, week_2 is days 8 - 14, week_3 is days 15 - 21, and week_4 are the
# rest of the days
categories = ["week_1", "week_2", "week_3", "week_4"]
movies_discretized["week"] = pd.cut(movies_discretized["day"], [0, 8, 15, 22, 32], labels = categories)
movies_discretized.week.dtype
#Looking at the relationship between genre and percent profit
movies_discretized_genre_pp = groupby_2_count(movies_discretized, "genre", "percent_profit", "score")
movies_discretized_genre_pp.head()
#Now we are getting the sum of each genre category... We do not have a function for sum... we could go back and rework
#our function.
movies_discretized_genre_pp.groupby("genre")["count"].sum()
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
'''We ultimately want a column that contains the total counts for each genre group. We are probably doing this in
a roundabout way, but as I am extremely new to python this is the best way I can think of doing it. We are going to create
a new column that replicates the genre column called genre_count and then we will use the replace function to
replace the genre names with their total count '''
#First, replicating the genre column in a new column named genre_count
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre"]
#Now replacing each genre name with the total count for that genre
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Action"], 301 )
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Adventure"], 54)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Animation"], 33)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Biography"], 38)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Comedy"], 233)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Crime"], 69)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Drama"], 116)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Fantasy"], 4)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Horror"], 27)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Mystery"], 7)
movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp["genre_count"].replace(["Romance"], 1)
movies_discretized_genre_pp.head()
movies_discretized_genre_pp["genre_count"] = pd.to_numeric(movies_discretized_genre_pp["genre_count"])
#Okay, we are one step closer... Now, we need to create a column that takes the counts/genre_counts * 100
movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"] *100
movies_discretized_genre_pp.head()
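#For reference, the hard-coded replace steps above could be collapsed into one groupby transform (a sketch,
#not run here), which computes each genre's total directly and would not need retyping if the data changes:
#movies_discretized_genre_pp["genre_count"] = movies_discretized_genre_pp.groupby("genre")["count"].transform("sum")
#movies_discretized_genre_pp["percent"] = movies_discretized_genre_pp["count"]/movies_discretized_genre_pp["genre_count"]*100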
# Attempting to graph this data using a grouped bar chart:
# formula: df.pivot(index = ..., columns = ..., values = ...).plot(kind = "type of graph", color = [list of colors],
# title = "you can set the title of your graph here")
graph = movies_discretized_genre_pp.pivot(index = "genre", columns = "percent_profit",
        values = "percent").plot(kind = "bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
        title = "Percent of Percent Profit to Genre Category")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Genre
plt.xlabel("Genre")
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.6), ncol = 4, title = "Percent Makeup of Genre Category")
##############################################################################################################
# We are going to implement machine learning to see if we can predict either imdb score or percent profit
##############################################################################################################
#
##############################################################################################################
# We are going to create an all numeric df, that does not contain the profit, percent profit, votes, and gross.
# This information would give us an unfair look when trying to predict either score or percent profit, as these
# metrics would not be available prior to the release of the movie.
################################################################################################################
# Potential Machine Learning algorithms we can use on our all numeric df:
# logistic regression
# Naive Bayes
# k-means
# Decision Tree
# knn
# svm
################################################################################################################
#
#
################################################################################################################
# We are also going to create an all discretized df
################################################################################################################
################################################################################################################
# Potential Machine Learning algorithms we can use on our discretized df:
# Decision Tree
# Association Rule Mining
# Random Forest
################################################################################################################
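#A rough sketch of how the discretized df could feed the apriori import above when we get to association rule
#mining (not run here; the column choice and thresholds are placeholders): apyori expects each row as a list of
#items, so we turn every row into "column=value" strings first.
#transactions = (movies_discretized[["genre", "rating", "budget", "score", "percent_profit"]].astype(str)
#                .apply(lambda row: [col + "=" + val for col, val in row.items()], axis = 1).tolist())
#rules = list(apriori(transactions, min_support = 0.05, min_confidence = 0.5, min_lift = 1.2))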
#################################################################
#Naive Bayes
#*** All Numeric Data ***
#################################################################
#Creating testing and training datasets comprised of all numeric data for score
test_train_movies = movies.copy()
test_train_movies_score = test_train_movies.copy()
test_train_movies_score.score.describe()
#We believe that people will be most interested in whether they will get a high imdb score, not whether they will have an
#average or low score... Due to this, we have decided to discretize score into 2 categories: high and not_high.
#We have decided that a high score is 7+.
test_train_movies_score = test_train_movies_score.copy()
categories = ["not_high", "high"]
# not_high = 0 - 7
# high = 7 - 10
test_train_movies_score["score"] = pd.cut(test_train_movies_score["score"], [0, 7, 10], labels = categories)
#Getting a count of the number of movies that are classified as high and not high in our df.
test_train_movies_score_count = test_train_movies_score.groupby("score")["score"].count()
test_train_movies_score_count
# We are going to create a testing and training df that contains 242 not_high entries and 242 high entries
#First we are going to subset the not_high scores and the high scores
not_high = test_train_movies_score[test_train_movies_score["score"] == "not_high"]
test_train_movies_score = test_train_movies_score[test_train_movies_score["score"] == "high"]
#Getting the length to make sure that we have 641 movies
len(not_high)
#Checking to make sure that the test_train_movies_score is equal to the 242 high movies
len(test_train_movies_score)
#Now getting a random sample of 242 entries in the not_high df and setting the seed to 123 to reproduce the results
not_high = not_high.sample(n = 242, random_state = 123)
#Getting the length to make sure that it worked
len(not_high)
#Adding the not_high movies back to the test_train_movies_score df
test_train_movies_score = pd.concat([test_train_movies_score, not_high])
len(test_train_movies_score)
#Changing the data type of month and day to numeric
columns = ["month", "day"]
test_train_movies_score[columns] = test_train_movies_score[columns].apply(pd.to_numeric)
# We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential
# imdb rating) and all non numeric data
#We are creating a test_train_movies_score_exp df that includes the exponential moving averages. We are interested to
#see if the cumulative or exponential moving averages will help our algorithm the most.
#Using the exponential moving average to try to predict score first
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_exp = test_train_movies_score[columns].copy()
#Creating a test_train_movies_score_cumulative df
#We need to remove gross, votes, profit, and percent profit (these columns give an unfair look at the potential imdb rating) and all non numeric data
#Getting a list of column names, so we can copy and paste it to select the columns we want.
columns = list(test_train_movies_score.columns)
columns = ['budget', 'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_score_cumulative = test_train_movies_score[columns].copy()
#Creating a df that contains both the cumulative and exponential moving averages
columns = ['budget', 'runtime', 'score', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_score_cumulative_exp = test_train_movies_score[columns].copy()
# #removing the label from the test_train_movies_score_exp df and saving it in a label df
test_train_movies_score_exp_label = test_train_movies_score_exp["score"]
test_train_movies_score_exp.drop("score", axis = 1, inplace = True)
#repeating the process for cumulative
test_train_movies_score_cumulative_label = test_train_movies_score_cumulative["score"]
test_train_movies_score_cumulative.drop("score", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_score_cumulative_exp_label = test_train_movies_score_cumulative_exp["score"]
test_train_movies_score_cumulative_exp.drop("score", axis = 1, inplace = True)
#Creating 4 dfs: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
#For each test_train df
from sklearn.model_selection import train_test_split
score_exp_train, score_exp_test, score_exp_train_label, score_exp_test_label = train_test_split(test_train_movies_score_exp, test_train_movies_score_exp_label, test_size = .3, random_state = 123)
score_cumulative_train, score_cumulative_test, score_cumulative_train_label, score_cumulative_test_label = train_test_split(test_train_movies_score_cumulative, test_train_movies_score_cumulative_label, test_size = .3, random_state = 123)
score_cumulative_exp_train, score_cumulative_exp_test, score_cumulative_exp_train_label, score_cumulative_exp_test_label = train_test_split(test_train_movies_score_cumulative_exp, test_train_movies_score_cumulative_exp_label, test_size = .3, random_state = 123)
#Getting a count of high scores and not_high scores in our test label
#We would prefer to have an equal or almost equal number of movies classified as high and not high in our test set
Counter(score_exp_test_label)
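#Note: if the split above came back unbalanced, train_test_split can enforce the class balance for us by
#passing stratify = the label series, e.g. (a sketch, not run here, so the results above stay unchanged):
#train_test_split(test_train_movies_score_exp, test_train_movies_score_exp_label, test_size = .3,
#                 random_state = 123, stratify = test_train_movies_score_exp_label)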
#Using the standard scale to help preprocess and normalize the data
# performing preprocessing part
sc = StandardScaler()
score_exp_train = sc.fit_transform(score_exp_train)
score_exp_test = sc.transform(score_exp_test)
score_cumulative_train = sc.fit_transform(score_cumulative_train)
score_cumulative_test = sc.transform(score_cumulative_test)
score_cumulative_exp_train = sc.fit_transform(score_cumulative_exp_train)
score_cumulative_exp_test = sc.transform(score_cumulative_exp_test)
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(score_exp_train, score_exp_train_label)
test_predicted_exp_nb = clf.predict(score_exp_test)
clf.fit(score_cumulative_train, score_cumulative_train_label)
test_predicted_cumulative_nb = clf.predict(score_cumulative_test)
clf.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
test_predicted_cumulative_exp_nb = clf.predict(score_cumulative_exp_test)
#Accuracy for exp
exp_accuracy_nb = accuracy_score(score_exp_test_label, test_predicted_exp_nb, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_nb)
confusion_matrix_graph(cm, exp_accuracy_nb, "Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_nb))
#Accuracy for cumulative
cum_accuracy_nb = accuracy_score(score_cumulative_test_label, test_predicted_cumulative_nb, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_nb)
confusion_matrix_graph(cm, cum_accuracy_nb, "Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_nb))
#Accuracy for cumulative and exp
cum_exp_accuracy_nb = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb,
normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb)
confusion_matrix_graph(cm, cum_exp_accuracy_nb, "Cumulative & Exp")
print(classification_report(score_cumulative_exp_test_label, test_predicted_cumulative_exp_nb))
#The Naive Bayes model with the cumulative means proved better than the other options. While it correctly classified
#fewer high scores than the other two models, it had much better accuracy classifying not_high scores as well as high
#scores, correctly classifying 57 out of 74 high scores and 52 out of 72 not_high scores. The other models
#suffered when classifying not_high scores.
###########################################################################################
# PCA and Logistic Regression
###########################################################################################
#Running PCA (Principal Component Analysis) to see which variables are the most beneficial for our prediction
# pca = PCA(n_components = 2)
# score_exp_train_pca = pca.fit_transform(score_exp_train)
# score_exp_test_pca = pca.transform(score_exp_test)
# explained_variance = pca.explained_variance_ratio_
# explained_variance
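#If we wanted to actually use PCA here, one way to pick n_components is to fit it on the scaled training data
#and look at the cumulative explained variance (a sketch; the 5 is a placeholder):
#pca = PCA(n_components = 5)
#pca.fit(score_exp_train)
#print(np.cumsum(pca.explained_variance_ratio_))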
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(score_exp_train, score_exp_train_label)
#Predicting the test labels
test_predicted_exp_lg = classifier.predict(score_exp_test)
#getting the accuracy
exp_accuracy_lg = accuracy_score(score_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_lg))
#For the cumulative avg
classifier.fit(score_cumulative_train, score_cumulative_train_label)
#Predicting the test labels
test_predicted_cumulative_lg = classifier.predict(score_cumulative_test)
#getting the accuracy
cum_accuracy_lg =accuracy_score(score_cumulative_test_label, test_predicted_cumulative_lg, normalize = True)
#Looking at the confusion matrix
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cumulative_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cumulative_lg))
#For the cumulative and exponential averages combined
classifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
#Predicting the test labels
test_predicted_cumulative_exp_lg = classifier.predict(score_cumulative_exp_test)
#getting the accuracy
cum_exp_accuracy_lg = accuracy_score(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg, normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cumulative_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Cumulative & Exp")
print(classification_report(score_cumulative_exp_test_label,test_predicted_cumulative_exp_lg))
#Out of the 3 different testing and training dfs, the df with the exponential moving averages proved best for our
#logistic regression models. We had an overall accuracy of 72.6% and accurately classified high scores 74.4% of the
#time and not_high scores 64.1% of the time...
###########################################################################################
# SVMS
###########################################################################################
#Cumulative SVM - Fitting the classifier
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_train, score_cumulative_train_label)
#Making the predictions
test_predicted_cum_svm = svclassifier.predict(score_cumulative_test)
#Creating my confusion matrix
cum_accuracy_svm = accuracy_score(score_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(score_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
print(classification_report(score_cumulative_test_label, test_predicted_cum_svm))
#Exp SVM - Fitting the classifier
svclassifier = SVC(kernel='linear')
svclassifier.fit(score_exp_train, score_exp_train_label)
#Making the predictions
test_predicted_exp_svm = svclassifier.predict(score_exp_test)
#Creating my confusion matrix
exp_accuracy_svm = accuracy_score(score_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(score_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
print(classification_report(score_exp_test_label, test_predicted_exp_svm))
#Exp & Cum SVM - Fitting the classifier
svclassifier = SVC(kernel='sigmoid')
svclassifier.fit(score_cumulative_exp_train, score_cumulative_exp_train_label)
#Making the predictions
test_predicted_cum_exp_svm = svclassifier.predict(score_cumulative_exp_test)
#Creating my confusion matrix
cum_exp_accuracy_svm = accuracy_score(score_cumulative_exp_test_label, test_predicted_cum_exp_svm,
normalize = True)
cm = confusion_matrix(score_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
print(classification_report(score_cumulative_exp_test_label, test_predicted_cum_exp_svm))
################################################################################################
#
# Now looking to see if we can predict percent profit
#
################################################################################################
#We will be using the same columns, but instead of having score, we will have percent_profit.
#We are interested in whether a movie will make a profit or not. Therefore, we are only discretizing into 2 categories:
#negative and positive.
test_train_movies_pp = test_train_movies_score.copy()
categories = ["negative", "positive"]
# Negative anything less than 0
#positive anything greater than 0
test_train_movies_pp["percent_profit"] = pd.cut(test_train_movies_pp["percent_profit"], [-101, 0, 999999], labels = categories)
#Getting the count of each category in our df
test_train_movies_pp_count = test_train_movies_pp.groupby("percent_profit")["percent_profit"].count()
test_train_movies_pp_count
# We are going to create a testing and training df that contains 198 negative, 198 positive percent_profits
#First we are going to subset the positive percent_profits and the negative percent_profits
positive = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "positive"]
test_train_movies_pp = test_train_movies_pp[test_train_movies_pp["percent_profit"] == "negative"]
#Getting the length to make sure that we have 198 negative and 286 positive in our df
print(len(positive))
print(len(test_train_movies_pp))
#Now getting a random sample of 198 entries in the positive df and setting the seed to 123
#to reproduce the results
positive = positive.sample(n = 198, random_state = 123)
#Getting the length to make sure that it worked
print(len(positive))
#Adding the positive movies back to the test_train_movies_pp df
test_train_movies_pp = pd.concat([test_train_movies_pp, positive])
#Getting the length to make sure that the 2 dfs were combined correctly; if they were, we will have 396 movies in our df
len(test_train_movies_pp)
#Changing the data type of month and day to numeric
columns = ["month", "day"]
test_train_movies_pp[columns] = test_train_movies_pp[columns].apply(pd.to_numeric)
'''We need to remove gross, votes, profit, and score (these columns give an unfair look at the potential
percent profit) and all non numeric data
'''
#Using the exponential moving average to try to predict percent_profit first
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', 'month', 'day', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_exp = test_train_movies_pp[columns].copy()
#We need to remove gross, votes, profit, and score (these columns give an unfair look at the potential percent profit) and all non numeric data
#Getting a list of column names, so we can copy and paste it to select the columns we want.
columns = list(test_train_movies_pp.columns)
columns = ['budget', 'runtime', 'percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg']
test_train_movies_pp_cumulative = test_train_movies_pp[columns].copy()
columns = ['percent_profit', "month", 'day', 'cumulative_mean_writer', 'cumulative_mean_director',
'cumulative_mean_star', 'cumulative_mean_company', 'cumulative_mean_avg', 'exp_mean_writer', 'exp_mean_director',
'exp_mean_star', 'exp_mean_company', 'exp_mean_avg']
test_train_movies_pp_cumulative_exp = test_train_movies_pp[columns].copy()
# #removing the label from the test_train_movies_pp df and saving it in a label df
test_train_movies_pp_exp_label = test_train_movies_pp_exp["percent_profit"]
test_train_movies_pp_exp.drop("percent_profit", axis = 1, inplace = True)
#repeating the process for cumulative
test_train_movies_pp_cumulative_label = test_train_movies_pp_cumulative["percent_profit"]
test_train_movies_pp_cumulative.drop("percent_profit", axis = 1, inplace = True)
# #repeating the process for the cumulative and exp combined dfs
test_train_movies_pp_cumulative_exp_label = test_train_movies_pp_cumulative_exp["percent_profit"]
test_train_movies_pp_cumulative_exp.drop("percent_profit", axis = 1, inplace = True)
#Creating 4 df: 1: the training df with label removed, 2: the testing df with label removed, 3: the training label, 4: testing label
from sklearn.model_selection import train_test_split
pp_exp_train, pp_exp_test, pp_exp_train_label, pp_exp_test_label = train_test_split(test_train_movies_pp_exp, test_train_movies_pp_exp_label, test_size = .3, random_state = 123)
pp_cumulative_train, pp_cumulative_test, pp_cumulative_train_label, pp_cumulative_test_label = train_test_split(test_train_movies_pp_cumulative, test_train_movies_pp_cumulative_label, test_size = .3, random_state = 123)
pp_cumulative_exp_train, pp_cumulative_exp_test, pp_cumulative_exp_train_label, pp_cumulative_exp_test_label = train_test_split(test_train_movies_pp_cumulative_exp, test_train_movies_pp_cumulative_exp_label, test_size = .3, random_state = 123)
#Getting a count of each percent_profit category in our test label
#We want to have roughly the same number of positive and negative movies in our test df
Counter(pp_exp_test_label)
#Using the standard scale to help preprocess and normalize the data
# performing preprocessing part
sc = StandardScaler()
pp_exp_train = sc.fit_transform(pp_exp_train)
pp_exp_test = sc.transform(pp_exp_test)
pp_cumulative_train = sc.fit_transform(pp_cumulative_train)
pp_cumulative_test = sc.transform(pp_cumulative_test)
pp_cumulative_exp_train = sc.fit_transform(pp_cumulative_exp_train)
pp_cumulative_exp_test = sc.transform(pp_cumulative_exp_test)
#Attempt 1: all variables
clf = GaussianNB()
clf.fit(pp_exp_train, pp_exp_train_label)
test_predicted_exp_nb = clf.predict(pp_exp_test)
clf.fit(pp_cumulative_train, pp_cumulative_train_label)
test_predicted_cumulative_nb = clf.predict(pp_cumulative_test)
clf.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
test_predicted_cumulative_exp_nb = clf.predict(pp_cumulative_exp_test)
#Accuracy for exp
exp_accuracy_nb = accuracy_score(pp_exp_test_label, test_predicted_exp_nb, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_nb)
confusion_matrix_graph(cm, exp_accuracy_nb, "NB Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_nb))
#Accuracy for cumulative
cum_accuracy_nb = accuracy_score(pp_cumulative_test_label, test_predicted_cumulative_nb, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cumulative_nb)
confusion_matrix_graph(cm, cum_accuracy_nb, "NB Cumulative")
#Accuracy for cumulative and exp
exp_cum_accuracy_nb = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_nb, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cumulative_exp_nb)
confusion_matrix_graph(cm, exp_cum_accuracy_nb, "NB Exp & Cumulative")
###########################################################################################
# PCA and Logistic Regression
###########################################################################################
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_exp_train, pp_exp_train_label)
#Predicting the test labels
test_predicted_exp_lg = classifier.predict(pp_exp_test)
#getting the accuracy
exp_accuracy_lg = accuracy_score(pp_exp_test_label, test_predicted_exp_lg, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_lg)
confusion_matrix_graph(cm, exp_accuracy_lg, "LG Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_lg))
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_cumulative_train, pp_cumulative_train_label)
#Predicting the test labels
test_predicted_cum_lg = classifier.predict(pp_cumulative_test)
#getting the accuracy
cum_accuracy_lg = accuracy_score(pp_cumulative_test_label, test_predicted_cum_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_lg)
confusion_matrix_graph(cm, cum_accuracy_lg, "LG Cumulative")
print(classification_report(pp_cumulative_test_label, test_predicted_cum_lg))
#Fitting Logistic Regression
classifier = LogisticRegression(random_state = 723)
classifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
#Predicting the test labels
test_predicted_cum_exp_lg = classifier.predict(pp_cumulative_exp_test)
#getting the accuracy
cum_exp_accuracy_lg = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg, normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg)
confusion_matrix_graph(cm, cum_exp_accuracy_lg, "LG Exp & Cumulative")
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_lg))
###########################################################################################
# SVMS
###########################################################################################
#Cumulative SVM - Fitting the classifier
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_train, pp_cumulative_train_label)
#Making the predictions
test_predicted_cum_svm = svclassifier.predict(pp_cumulative_test)
#Creating my confusion matrix
cum_accuracy_svm = accuracy_score(pp_cumulative_test_label, test_predicted_cum_svm, normalize = True)
cm = confusion_matrix(pp_cumulative_test_label, test_predicted_cum_svm)
confusion_matrix_graph(cm, cum_accuracy_svm, "SVM Cumulative")
print(classification_report(pp_cumulative_test_label, test_predicted_cum_svm))
#Exp SVM - Fitting the classifier
svclassifier = SVC(kernel='linear')
svclassifier.fit(pp_exp_train, pp_exp_train_label)
#Making the predictions
test_predicted_exp_svm = svclassifier.predict(pp_exp_test)
#Creating my confusion matrix
exp_accuracy_svm = accuracy_score(pp_exp_test_label, test_predicted_exp_svm, normalize = True)
cm = confusion_matrix(pp_exp_test_label, test_predicted_exp_svm)
confusion_matrix_graph(cm, exp_accuracy_svm, "SVM Exp")
print(classification_report(pp_exp_test_label, test_predicted_exp_svm))
#Exp & Cum SVM - Fitting the classifier
svclassifier = SVC(kernel='rbf')
svclassifier.fit(pp_cumulative_exp_train, pp_cumulative_exp_train_label)
#Making the predictions
test_predicted_cum_exp_svm = svclassifier.predict(pp_cumulative_exp_test)
#Creating my confusion matrix
cum_exp_accuracy_svm = accuracy_score(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm,
normalize = True)
cm = confusion_matrix(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm)
confusion_matrix_graph(cm, cum_exp_accuracy_svm, "SVM Exp & Cumulative")
print(classification_report(pp_cumulative_exp_test_label, test_predicted_cum_exp_svm))
# We had much more success when predicting score than when predicting percent profit... However, the SVM with a
#linear kernel on the exponential df did have a 57% accuracy.
#We believe that in order to achieve better accuracy when predicting percent profit, we need more data. Our
#next step is to find more data and then apply what we learned from this analysis to the new data.