from bs4 import BeautifulSoup
import json
import csv
import pandas as pd
from urllib.parse import quote
import requests
import time
import re
Data from kaggle.com
kaggle = pd.read_csv("movies.csv", encoding = "ISO-8859-1")
len(kaggle)
kaggle_cleaned = pd.read_csv("working_movies_usa.csv", encoding = "ISO-8859-1" )
len(kaggle_cleaned)
Data from scraping imdb.com (see below for details)
imdb_707 = pd.read_csv("movies_IST707.csv", encoding = "ISO-8859-1" )
len(imdb_707)
imdb_ids = pd.read_csv("ids_from_imdb.csv", encoding = "ISO-8859-1" )
len(imdb_ids)
imdb_scores = pd.read_csv("V2_IMDB_score_data.csv", encoding = "ISO-8859-1")
len(imdb_scores)
Data from hitting the TMDB api (see below for details)
# First attempt
tmdb_movies = pd.read_csv("tmdb_movies_csv.csv", encoding = "ISO-8859-1")
len(tmdb_movies)
# Taking every actor in our kaggle dataset and getting their movie credits using TMDB's api
tmdb_actors = pd.read_csv("tmdb_20k.csv", encoding = "ISO-8859-1")
len(tmdb_actors)
# Removing things without budget, revenue, production studio or genre
tmdb_actors_cleaned = pd.read_csv("tmdb_20k_cleaned.csv", encoding = "ISO-8859-1")
len(tmdb_actors_cleaned)
tmdb_v2 = pd.read_csv("tmdb_from_imdb_v2.csv", encoding = "ISO-8859-1")
len(tmdb_v2)
Data from scraping the-numbers.com (see below for details)
tn_1 = pd.read_csv("V2_TN_budget_data_and_url.csv", encoding = "ISO-8859-1")
len(tn_1)
tn_2 = pd.read_csv("V2_TN_reports_dates.csv", encoding = "ISO-8859-1")
len(tn_2)
# ==============================================================
# GETTING THE IMDB ID & SCORE
# ==============================================================
def get_info_from_movies(movies):
ids_for_movies_in_year = []
for i, movie in enumerate(movies):
link_with_id = movie.find('a', {'href': re.compile('/title/tt')})
imdb_id = link_with_id.attrs['href'].split('/')[2]
clean = "".join(line.strip() for line in movie.text.split("\n"))
# valiant regex attempt
# rating = re.compile('\)(.*)')
# name = re.compile('\..*\ ')
# date = re.compile('(\d{3}).')
title_rating_string = clean.split('0Rate')[0]
rating = title_rating_string.split(')')[1]
name = title_rating_string.split('.')[1].split('(')[0]
date = title_rating_string.split('(')[1].split(')')[0]
movie_dict = {
'imdb_id': imdb_id,
'name': name,
'imdb_rating': rating,
'date': date
}
ids_for_movies_in_year.append(movie_dict)
return(ids_for_movies_in_year)
def get_imdb_html(year, urlending):
url = ('https://www.imdb.com/search/title/?title_type=feature&boxoffice_gross_us=1,&release_date='+str(year)+'-01-01,'+str(year)+'-12-31&countries=us&view=simple&count=250'+urlending)
headers = {'Accept-Language': 'en-US'}
movies_html = requests.get(url.format(), headers=headers).content
soup = BeautifulSoup(movies_html, 'html.parser')
soup_main = soup.find("div", {"id": "main"})
movies_list = soup_main.find('div', class_="lister list detail sub-list")
movies = soup_main.find_all('div', class_="lister-item mode-simple")
return movies
def get_imdb_scores(year, urlending):
movies = get_imdb_html(year, urlending)
return get_info_from_movies(movies)
def run_imdb_scores_script():
all_the_ids = []
for year in range(1970, 2020):
all_the_ids += get_imdb_scores(year, '')
all_the_ids += get_imdb_scores(year, '&start=251')
all_the_ids_df = pd.DataFrame(all_the_ids)
all_the_ids_df.to_csv('imdb_ids.csv')
# save a small (2018) subset
# all_the_2018_ids_df = pd.DataFrame(all_the_ids[48])
# all_the_2018_ids_df.to_csv
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # UNCOMMENT TO RUN <3
# run_imdb_scores_script()
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# ==============================================================
# GETTING THE SUPPLIMENTAL INFO FROM TMBD API
# ==============================================================
# NOTE: This is a fun time capsule of how we wrote these files
# at the start of this project!! We'd like to think we've
# cleaned things up a bit!!
def write_csv(data):
df = pd.DataFrame(data)
df.to_csv('2018_movies.csv', index=False)
headers = {'Accept-Language': 'en-US'}
payload = "{}"
api_key = open("tmdb_api_key.txt")
api_key = api_key.read()
def get_tmdb_info(imdb_id_file):
all_movie_data = []
with open(imdb_id_file, encoding='utf-8') as csvfile:
movies = csv.reader(csvfile)
for movie in movies:
try:
url = "https://api.themoviedb.org/3/movie/"
thing_looking_for = movie[1]
my_api_key = "?api_key=" + api_key
full_url = url + thing_looking_for + my_api_key
res = requests.get(full_url, payload, headers=headers)
data = res.content.decode('UTF-8')
jdata = json.loads(data)
try:
title = jdata['title']
budget = jdata['budget']
genres = jdata['genres']
production_companies = jdata['production_companies']
release_date = jdata['release_date']
revenue = jdata['revenue']
profit = revenue - budget
popularity = jdata['popularity']
vote_average = jdata['vote_average']
vote_count = jdata['vote_count']
except KeyError:
title = 'NA'
budget = 'NA'
genres = 'NA'
production_companies = 'NA'
release_date = 'NA'
revenue = 'NA'
profit = 'NA'
popularity = 'NA'
vote_average = 'NA'
vote_count = 'NA'
movie_data = {
'release_date': release_date,
'title': title,
'budget': budget,
'genres': genres,
'production_companies': production_companies,
'revenue': revenue,
'profit': profit,
'popularity': popularity,
'vote_average': vote_average,
'vote_count': vote_count
}
all_movie_data.append(movie_data)
except UnicodeDecodeError:
director_data = {}
all_movie_data_df = pd.DataFrame(all_movie_data)
print(all_movie_data_df)
write_csv(all_movie_data)
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # UNCOMMENT TO RUN <3
# get_tmdb_info('imdb_ids_2018.csv')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# 1. get the page (get_tn_data)
# 2. get the soup (get_soup)
# 3. get the data from the soup (get_data_from_soup)
def get_data_from_soup(soup_data):
all_movies = []
for data in soup_data[1:]:
data_array = data.text.split('\n')
movie_data = {
'num': data_array[0],
'release_date': data_array[1],
'name': data_array[2],
'production_budget': data_array[3],
'domestic_gross': data_array[4],
'worldwide_gross': data_array[5]
}
all_movies.append(movie_data)
return all_movies
def get_soup(num):
url = ('https://www.the-numbers.com/movie/budgets/all'+num)
headers = {'Accept-Language': 'en-US'}
movies_html = requests.get(url.format(), headers=headers).content
soup = BeautifulSoup(movies_html, 'html.parser')
soup_data = soup.find_all("tr")
return soup_data
def get_tn_data(num):
tn_soup = get_soup(num)
return get_data_from_soup(tn_soup)
# 1. create array to house all data
# 2. quick workaround for our first page
# 3. iterate through the the-numbers url
# 4. save to df, save to csv
def run_TN_script():
all_pages = []
all_pages += get_tn_data('')
for i in range(1,59):
results = get_tn_data('/'+ str(i) + '01')
all_pages += results
all_pages_df = pd.DataFrame(all_pages)
all_pages_df.to_csv('TN_budget_data.csv')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # UNCOMMENT TO RUN <3
# run_TN_script()
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
def format_data(data_array):
movie_data = {
'Released': data_array[2],
'ReleasedWorldwide': data_array[3],
'Released_2': data_array[4],
'ReleasedWorldwide_2': data_array[5],
'Title': data_array[6],
'TheatricalDistributor': data_array[7],
'Genre': data_array[8],
'Source': data_array[9],
'ProductionMethod': data_array[10],
'CreativeType': data_array[11],
'ProductionBudget': data_array[12],
'OpeningWeekendTheaters': data_array[13],
'MaximumTheaters': data_array[14],
'TheatricalEngagements': data_array[15],
'OpeningWeekendRevenue': data_array[16],
'DomesticBoxOffice': data_array[17],
'Infl.Adj.Dom.BoxOffice': data_array[18],
'InternationalBoxOffice': data_array[19],
'WorldwideBoxOffice': data_array[20]
}
return movie_data
def get_report(year):
url = ('https://www.the-numbers.com/movies/report/All/All/All/All/All/All/All/All/All/.1/None/'+str(year)+'/'+ str(year + 1)+'/None/None/None/None/None/None?show-release-date=On&view-order-by=domestic-box-office&show-release-year=On&view-order-direction=desc&show-production-budget=On&show-opening-weekend-theaters=On&show-domestic-box-office=On&show-maximum-theaters=On&show-inflation-adjusted-domestic-box-office=On&show-theatrical-engagements=On&show-international-box-office=On&show-opening-weekend-revenue=On&show-worldwide-box-office=On&show-worldwide-release-date=On&show-worldwide-release-year=On&show-theatrical-distributor=On&show-genre=On&show-source=On&show-production-method=On&show-creative-type=On')
headers = {'Accept-Language': 'en-US'}
movies_html = requests.get(url.format(), headers=headers).content
soup = BeautifulSoup(movies_html, 'html.parser')
soup_data = soup.find_all("tr")
all_movies = []
for data in soup_data[1:]:
data_array = data.text.split('\n')
try:
url = data.find_all('a')[0]
cast_data = get_cast(url)
# Saving summary data for V2
# summary_data = get_summary(url)
data_object = format_data(data_array)
data_object.update(cast_data)
all_movies.append(data_object)
except:
print('no report')
return all_movies
def get_summary(url):
url = 'https://www.the-numbers.com' + url.attrs['href']
headers = {'Accept-Language': 'en-US'}
movies_html = requests.get(url.format(), headers=headers).content
soup = BeautifulSoup(movies_html, 'html.parser')
soup_main = soup.find("div", {"id": "summary"})
return "coming soon"
def get_cast(url):
url = 'https://www.the-numbers.com' + url.attrs['href'].split("#")[0]+"#tab=cast-and-crew"
headers = {'Accept-Language': 'en-US'}
movies_html = requests.get(url.format(), headers=headers).content
soup = BeautifulSoup(movies_html, 'html.parser')
soup_main = soup.find("div", {"id": "cast-and-crew"})
soup_data = soup_main.find_all("div", class_="cast_new")
cast_data = {}
leads = []
supporting = []
production = []
for data in soup_data:
if 'Lead' in data.h1.text:
cast = data.find_all("tr")
for castmember in cast:
leads.append(castmember.text.strip().split('\n')[0])
if 'Supporting' in data.h1.text:
cast = data.find_all("tr")
for castmember in cast:
supporting.append(castmember.text.strip().split('\n')[0])
if 'Production' in data.h1.text:
cast = data.find_all("tr")
for castmember in cast:
production.append({castmember.text.strip().split('\n')[2]: castmember.text.strip().split('\n')[0]})
cast_data.update({'star': leads[0]})
cast_data.update({'leads': leads})
cast_data.update({'supporting': supporting})
cast_data.update({'production': production})
return cast_data
def run_TN_reports_script():
all_pages = []
for year in range(2000,2020):
results = get_report(year)
all_pages += results
pd.DataFrame(all_pages).to_csv('TN_reports_data.csv')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # UNCOMMENT TO RUN <3
# run_TN_reports_script()
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
def clean_merged_file(merged_file):
big_movies = pd.read_csv(merged_file, encoding = "ISO-8859-1")
big_movies_clean = pd.DataFrame({
"id": big_movies['id'],
"imdb_id": big_movies['imdb_id'],
"name": big_movies['name'],
"budget": big_movies['budget'],
"revenue": big_movies['revenue'],
"runtime": big_movies['runtime'],
"score": big_movies['score'],
"vote_count": big_movies['vote_count'],
"released": big_movies['released'],
"tagline": big_movies['tagline'],
"production_companies": big_movies['production_companies'],
"genres": big_movies['genres']
})
return big_movies_clean
def get_all_from_list(list_of_things, num, key_to_get):
if list_of_things == '[]':
return 'na'
else:
try:
return eval(list_of_things)[num][key_to_get]
except:
return eval(list_of_things)[0][key_to_get]
# NOTE: This section only worked for some iterations of our data
# A clear sign we must refactor!
def widen_df(big_movies_clean):
# the slash at the end of the line is so we can split it into two lines
# PRODUCTION COMPANIES
big_movies_clean['production_company_1'] = big_movies.apply \
(lambda x: get_all_from_list(x['production_companies'], 0, 'name'),axis=1)
big_movies_clean['production_company_2'] = big_movies.apply \
(lambda x: get_all_from_list(x['production_companies'], 1, 'name'),axis=1)
big_movies_clean['production_company_3'] = big_movies.apply \
(lambda x: get_all_from_list(x['production_companies'], 2, 'name'),axis=1)
# # GENRES
big_movies_clean['genre_1'] = big_movies.apply \
(lambda x: get_all_from_list(x['genres'], 0, 'name'),axis=1)
big_movies_clean['genre_2'] = big_movies.apply \
(lambda x: get_all_from_list(x['genres'], 1, 'name'),axis=1)
big_movies_clean['genre_3'] = big_movies.apply \
(lambda x: get_all_from_list(x['genres'], 2, 'name'),axis=1)
def run_clean_file_script():
big_movies_clean = clean_merged_file("testing_first_merge.csv")
big_movies_clean_v2 = widen_df(big_movies_clean)
big_movies_clean_v2.to_csv('big_movies_clean_v2.csv')
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# # UNCOMMENT TO RUN <3
# run_clean_file_script()
# ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatchesphew
# ======================================================================
# DOING EXPLORATORY DATA ANALYSIS ON A SMALL SUBSET (2018) OF THE DATA
# ======================================================================
# NOTE: The comments in this section are a nice melding of Ali practicality
# and Kendra color. Please do not fault Ali for the bad puns and word play
# -------------------------------------
# BUT FIRST, WE PREP!!
# -------------------------------------
# STEP 1: readin' and cleanin'
movies = pd.read_csv('2018_movies.csv')
movies.head()
movies.shape
# just say nah to na
# droping the first row of NaNs
movies = movies.drop([0,])
movies.shape
# dropping na and NaN in place
movies.dropna(inplace = True)
# STEP 2: droppin' and removin'
# say bye bye to those without budget
# (removing movies without the information we need)
index_names = movies[movies["budget"] == 0].index
# We can see that 325 movies in our df have a budget of 0 dollars...
# We have to drop these movies
index_names
# Repeating above... but with revenue
movies.drop(index_names, inplace = True)
index_names = movies[movies["revenue"] == 0].index
movies.drop(index_names, inplace = True)
# STEP 3: formattin' and finessin'
# Type-casting isn't just for Hollywood
# Checking to see the data type for the release_date column
movies.release_date.dtype
#It shows that it is saved as an object, we want to convert this to date format
#Changing the data type to date by using the pd_to_datetime function, this will allow us to extract each element of the date
movies["release_date"] = pd.to_datetime(movies["release_date"])
#now we want to extract the month, day, and year and create new columns named month, day and year
movies["month"], movies["day"], movies["year"] = movies["release_date"].dt.month, movies["release_date"].dt.day, movies["release_date"].dt.year
# STEP 4: surmisin' and summarisin'
# Getting summary statistics for our df
movies.describe()
It's interesting to note that the lowest budget for a movie in our df is only 258,157 and the maximum budget is 300,000,000. The lowest revenue is 4,537. The maximum revenue is 2,046,240,000. This is a massive revenue. We aggregated a profit column and the minimum profit is - 60,477,350 and maximum profit is 1,746,240,000. This is a massive profit. Their is a large range in popularity scores. The maximum popularity score is 71.54 and the minimum is 3.54. The mean popularity is 18.4 and 75% of the movies have a popularity score less than 21.2. This makes us question if the maximum popularity score might be an error, or it might correspond to the movie with the highest profit. We should investigate what movie this score references. The vote_average column has a range of 4 - 8.4, with an average of 6.5. The vote_count has a maximum of 14,913. This might reference the same movie that had the largest popularity. The max value in this column, also appears to be an outlier, as 75% of the movies have less than 2151 votes and the average vote count is 1701
# STEP 5: column creatin'
#We have decided that we want to aggregate a percent profit column as well, in an attempt to normalize the data
#To do this we are diving the profit column by the budget column and multiply the result by 100 and saving it in a
#new column named percent_profit
movies["percent_profit"] = movies["profit"]/movies["budget"]*100
# Saving a column as-is for future use
movies_original_df = movies
movies.budget.describe()
First, we are going to discretize the budget column. We are discretizing the budget column into four groups: extremely_low, low, high and extremely_high. To do this we first need to create a list of the categories
categories = ["extremely_low", "low", "high", "extremely_high"]
Now we need to show where we want to insert the breaks. We have decided that extremely low budgets are budgets less than 13,000,000, low have budgets between 13,000,000 and 30,000,000, high have budgets between 30,000,000 and 62,192,550, and extremely_high have budgets between 62,192,550 and 300,000,000. We chose the values based on the quartiles.
#Saving the movies df as movies_discretized_df
movies_discretized_df = movies
#Discretizing the budget columns using the cut function from pandas
movies_discretized_df["budget"] = pd.cut(movies_discretized_df["budget"], [0, 13000000, 30000000, 62192550, 300000001], labels = categories)
#Now we are going to repeat the steps to discretize the revenue column
movies_discretized_df.revenue.describe()
#We are using the same categories as above
extremely_low
revenue are revenues less than 21,458,200, low
are revenues between 21,458,200 and
62,954,020, high
revenues are revenues between 62,954,020 and 187,976,900, and extremely_high
revenues between
187,976,900 and 2,046,240,000.
movies_discretized_df["revenue"] = pd.cut(movies_discretized_df["revenue"], [0, 21458200, 62954020, 187976900, 2046240001], labels = categories)
#Now we are going to repeat the steps to discretized the profit column
movies_discretized_df.profit.describe()
#Now we are going to repeat the steps to discretized the profit column
movies_discretized_df.profit.describe()
'''negative profit are profits less than $0, low profits are profits between $0 and
$29,314,900, high profits are profits between $29,314,900 and $140,784,100, and extremely_high profits between
$140,784,100 and $1,746,240,001.
'''
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized_df["profit"] = pd.cut(movies_discretized_df["profit"], [-60477351, 0, 29314900, 140784100, 1746240001], labels = categories)
movies_discretized_df.profit.dtype
#Now we are going to repeat the steps to discretize the popularity column
movies_discretized_df.popularity.describe()
categories = ["extremely_low", "low", "high", "extremely_high"]
'''extremely_low popularity are popularities less than 12.442, low popularities are popularities between 12.442 and
15.7405, high popularity are popularities between 15.7405 and 21.23025 and extremely_high popularity between 21.23025
and 71.538'''
movies_discretized_df["popularity"] = pd.cut(movies_discretized_df["popularity"], [0, 12.442, 15.7405, 21.23025, 71.538], labels = categories)
movies_discretized_df["popularity"].head()
#Now we are going to repeat the steps to discretize the vote avg
movies_discretized_df.vote_average.describe()
#We are using the same categories as above
'''extremely_low vote_average are vote averages less than 6, low are between 6 to 6.5, high between 6.5 and 7 and
extremely_high 7 and 8.5'''
movies_discretized_df["vote_average"] = pd.cut(movies_discretized_df["vote_average"], [0, 6, 6.5, 7, 8.5], labels = categories)
movies_discretized_df["vote_average"].head()
#We are using the same categories as above
'''extremely_low vote counts are vote counts less than 440, low are between 440 and 1151, high are between 1151 and 2522
and extremely_high are between 2522 and 14913'''
movies_discretized_df["vote_count"] = pd.cut(movies_discretized_df["vote_count"], [0, 440, 1151, 2522, 14914], labels = categories)
movies_discretized_df["vote_count"].head()
movies_discretized_df.percent_profit.describe()
'''extremely_low are percent profits between -100 and 0, low between 6.5 and 108, high between 108 and 436 and
extremely_high between 436 and 6527'''
categories = ["negative", "low", "high", "extremely_high"]
movies_discretized_df["percent_profit"] = pd.cut(movies_discretized_df["percent_profit"], [-100, 0, 108, 436, 6528], labels = categories )
movies_discretized_df["percent_profit"]
movies_discretized_df.day.describe()
#We are setting new categories for the day column by creating a new column for week
'''week_1 is the first 7 days of the month, week_2 is days 8 - 14, week_3 is days 15 - 21, and week_4 are the
rest of the days'''
categories = ["week_1", "week_2", "week_3", "week_4"]
movies_discretized_df["week"] = pd.cut(movies_discretized_df["day"], [0, 8, 15, 22, 32], labels = categories)
movies_discretized_df.head()
#We have successfully discretized the df, now we can remove the day and release_date column
movies_discretized_df.drop(columns=['day', 'release_date'], inplace = True)
#Checking to make sure that it worked
movies_discretized_df.head()
#Question 1:
#How are the amounts of percent_profits distributed across budget levels?
'''We want to compare the budget category percentage make up for each percent_profit level. To do this we need to
get the count for each budget level, the count for each percent_profit level by budget level and then divide
the count of the percent_profit/count of budget level and multiply by 100. We have to do this for each
budget level and level of percent_profits. We think that we could potentially answer this question by group bys.'''
movies_discretized_count = movies_discretized_df.groupby(["budget", "percent_profit"])["budget"].count()
'''Taking the output from the line above and converting it to a data frame. We are using pandas, which we important as pd.
First, we call the package we are using then the function from that package and then what we want to run the function on.
pd.function(item to use). We are using the DataFrame function from the pandas package on the series created by our group by'''
movies_discretized_count_df = pd.DataFrame(movies_discretized_count)
#Checking to see what our df looks like.
movies_discretized_count_df
#Changing the column name from budget to counts
movies_discretized_count_df.columns = ["counts"]
#Checking to see what our df looks like.
movies_discretized_count_df
#We want to get a total count for the number of percent_profit counts for each budget level. We will experiment to see how this is possible
'''This shows that we have 2 indexes budget and percent_profit... We want to create columns from each index
We are creating a new column named budget by extracting the values from the first index (0) which is the budget
index'''
movies_discretized_count_df["budget_category"]=movies_discretized_count_df.index.get_level_values(0)
#We are creating a new column named total_donations by extracting the values from the second index (1) which is total_donations
movies_discretized_count_df["percent_profit_category"] = movies_discretized_count_df.index.get_level_values(1)
#Checking to make sure it worked...
movies_discretized_count_df
#It did!
#Now we want to remove the indexes so, we can create a new group by to get the sum of the counts for each group
#To do this we are using the reset_index(drop = True) This will drop our group by indexes and allow us to create a new one.
movies_discretized_count_df = movies_discretized_count_df.reset_index(drop = True)
movies_discretized_count_df
#Now we are getting the sum of each budget category.
budget_discretized_count_df = movies_discretized_count_df.groupby(["budget_category"])["counts"].sum()
#Checking the results
budget_discretized_count_df
'''We ultimately want a column that contains the total counts for each budget group. We are probably doing this in
a roundabout way, but as I am extremely new to python this is the best way I can think of doing it. We are going to create
a new column that replicates the income_level called income_level_count and then we will use the replace function to
replace the 1s with their total count, the 2s with their total count... '''
#First, replicating the income level column in a column named budget_category_count
movies_discretized_count_df["budget_category_count"] = movies_discretized_count_df["budget_category"]
#Now replacing the income level with the total count for each income level
movies_discretized_count_df["budget_category_count"] = movies_discretized_count_df["budget_category_count"].replace(["extremely_low"], 35)
movies_discretized_count_df["budget_category_count"] = movies_discretized_count_df["budget_category_count"].replace(["low"], 35)
movies_discretized_count_df["budget_category_count"] = movies_discretized_count_df["budget_category_count"].replace(["high"], 32)
movies_discretized_count_df["budget_category_count"] = movies_discretized_count_df["budget_category_count"].replace(["extremely_high"], 34)
#Checking to see if that worked:
movies_discretized_count_df
#Okay, we are one step closer... Now, we need to create a column that takes the counts/budget_category_counts * 100
movies_discretized_count_df["percent"] = movies_discretized_count_df["counts"]/movies_discretized_count_df["budget_category_count"] *100
#Looking at our data frame... It worked!!!
movies_discretized_count_df
#We no longer need the count columns
movies_discretized_count_df.drop(["counts", "budget_category_count"], axis = 1, inplace = True )
'''Attempting to graph this data using a grouped bar chart:
formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"],
title = "you can set the title of your graph here")'''
graph = movies_discretized_count_df.pivot("budget_category", "percent_profit_category",
"percent").plot(kind="bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
title = "Percent of Percent Profit to Budget Category")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Budget Category
plt.xlabel("Budget Category")
#Making it so the tick labels are not angled
plt.xticks(rotation = 0)
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.4), ncol = 4, title = "Percent Profit Category")
This graph proved very interesting. Movies with an extremely low budget have the highest percentage make-up of making an extremely high percent profit. Movies with an extremely high budget are the most likely to be profitable overall, being that they are the least likely to have a negative profit, with only 5.9% of the movies classified as having an extremely high budget in our dataset made a negative profit. Movies with an low or high budget only make an extremely high percent profit less than 17.1% and 15.6% of the time respectively. They also have the highest chance of making a low or negative profit out of all of the budget categories. Based, on this analysis, percent profits are not uniformally distributed across budget levels. Movies with an extremely high budget are the least likely to have a negative percent profit. Movies with an extremely low budget are the most likely to have an extremely high percent profit. Our recommendation to studios, would be to either have a extremely low or extremely high budget and to veer away from productions with an extremely low or high budget. Further analysis for tighter recommendatios is needed.
#Question: Do big name production companies impact the percent profit?
#To answer this question we are first going to create a for loop that will loop through the production_companies column
#in order movies_discretized_df and store the production company in a list called production_company. The only issue
#with this method, is that if a movie has more than one production company this will not be shown and only the last
#company included in the for loop will be given credit.
production_company = []
for movie in movies_discretized_df['production_companies']:
if "Universal" in movie:
production_company.append("Universal")
elif "Sony" in movie:
production_company.append("Sony")
elif "Fox" in movie:
production_company.append("Fox")
elif "DreamWorks" in movie:
production_company.append("DW")
elif "MGM" in movie:
production_company.append("MGM")
elif "Paramount" in movie:
production_company.append("Paramount")
elif "Disney" in movie:
production_company.append("Disney")
elif "Warner Bros" in movie:
production_company.append("WB")
else:
production_company.append("None")
movies_discretized_df["main_production_co"] = production_company
movies_discretized_df.head()
#Now we want to group by production company and percent profit
'''We want to compare the production company percentage make up for each percent_profit level. To do this we need to
get the count for each production company, the count for each percent_profit level by production company and then divide
the count of the percent_profit/count of production company and multiply by 100. We have to do this for each
production company and level of percent_profits. We think that we could potentially answer this question by group bys.'''
movies_discretized_count_q2 = movies_discretized_df.groupby(["main_production_co", "percent_profit"])["main_production_co"].count()
'''Taking the output from the line above and converting it to a data frame. We are using pandas, which we important as pd.
First, we call the package we are using then the function from that package and then what we want to run the function on.
pd.function(item to use). We are using the DataFrame function from the pandas package on the series created by our group by'''
movies_discretized_count_df_q2 = pd.DataFrame(movies_discretized_count_q2)
#Checking to see what our df looks like.
movies_discretized_count_df_q2
#Changing the column name from budget to counts
movies_discretized_count_df_q2.columns = ["counts"]
#Checking to see what our df looks like.
movies_discretized_count_df_q2.head()
#We want to get a total count for the number of percent_profit counts for each production company.
'''This shows that we have 2 indexes budget and percent_profit... We want to create columns from each index
We are creating a new column named budget by extracting the values from the first index (0) which is the budget
index'''
movies_discretized_count_df_q2["production_company"]=movies_discretized_count_df_q2.index.get_level_values(0)
#We are creating a new column named total_donations by extracting the values from the second index (1) which is total_donations
movies_discretized_count_df_q2["percent_profit_category"] = movies_discretized_count_df_q2.index.get_level_values(1)
#Checking to make sure it worked...
movies_discretized_count_df_q2
#It did!
#Now we want to remove the indexes so, we can create a new group by to get the sum of the counts for each group
#To do this we are using the reset_index(drop = True) This will drop our group by indexes and allow us to create a new one.
movies_discretized_count_df_q2 = movies_discretized_count_df_q2.reset_index(drop = True)
#Now we are getting the sum of each production company category.
production_company_discretized_count_df_q2 = movies_discretized_count_df_q2.groupby(["production_company"])["counts"].sum()
#Checking the results
production_company_discretized_count_df_q2
'''We ultimately want a column that contains the total counts for each production company. We are going to create
a new column that replicates the production company called production_company_count and then we will use the replace function to
replace the 1s with their total count, the 2s with their total count... '''
#First, replicating the income level column in a column named budget_category_count
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company"]
#Now replacing the income level with the total count for each income level
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["DW"], 1)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["Disney"], 8)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["Fox"], 11)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["None"], 66)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["Paramount"], 9)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["Sony"], 8)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["Universal"], 20)
movies_discretized_count_df_q2["production_company_count"] = movies_discretized_count_df_q2["production_company_count"].replace(["WB"], 13)
movies_discretized_count_df_q2
#Okay, we are one step closer... Now, we need to create a column that takes the counts/budget_category_counts * 100
movies_discretized_count_df_q2["percent"] = movies_discretized_count_df_q2["counts"]/movies_discretized_count_df_q2["production_company_count"] *100
#Looking at our data frame... It worked!!!
movies_discretized_count_df_q2.head()
#We no longer need the count columns
movies_discretized_count_df_q2.drop(["counts", "production_company_count"], axis = 1, inplace = True )
'''Attempting to graph this data using a grouped bar chart:
formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"],
title = "you can set the title of your graph here")'''
graph = movies_discretized_count_df_q2.pivot("production_company", "percent_profit_category",
"percent").plot(kind="bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
title = "Percent of Percent Profit to Production Company")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Budget Category
plt.xlabel("Production Company")
#Making it so the tick labels are not angled
plt.xticks(rotation = 0)
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.4), ncol = 4, title = "Percent Profit Category")
This graph provides some insights, however, most of our movies have more than one main production company and only one production company is being shown. For example, DreamWorks and Universal had a movie named First Man and it was profitable. However, based on the way that we assigned a main production company, only Universal was given credit for that movie.
# Question:
#Does time of the month the movie is released affect percent profit?
'''We want to compare the percent_profit level percentage make up for each time of month. To do this we need to
get the count for each time of month, the count for each percent_profit level by time of month company and then divide
the count of the percent_profit/count of time of month and multiply by 100. We have to do this for each
time of month and level of percent_profits.'''
movies_discretized_count_q3 = movies_discretized_df.groupby(["week", "percent_profit"])["week"].count()
'''Taking the output from the line above and converting it to a data frame. We are using pandas, which we important as pd.
First, we call the package we are using then the function from that package and then what we want to run the function on.
pd.function(item to use). We are using the DataFrame function from the pandas package on the series created by our group by'''
movies_discretized_count_df_q3 = pd.DataFrame(movies_discretized_count_q3)
#Checking to see what our df looks like.
movies_discretized_count_df_q3
#Changing the column name from week to counts
movies_discretized_count_df_q3.columns = ["counts"]
#Checking to see what our df looks like.
movies_discretized_count_df_q3.head()
#We want to get a total count for the number of percent_profit counts for each week.
'''This shows that we have 2 indexes week and percent_profit... We want to create columns from each index
We are creating a new column named week by extracting the values from the first index (0) which is the week
index'''
movies_discretized_count_df_q3["week"]=movies_discretized_count_df_q3.index.get_level_values(0)
#We are creating a new column named total_donations by extracting the values from the second index (1) which is percent_profit
movies_discretized_count_df_q3["percent_profit_category"] = movies_discretized_count_df_q3.index.get_level_values(1)
#Checking to make sure it worked...
movies_discretized_count_df_q3
#It did!
#Now we want to remove the indexes so, we can create a new group by to get the sum of the counts for each group
#To do this we are using the reset_index(drop = True) This will drop our group by indexes and allow us to create a new one.
movies_discretized_count_df_q3 = movies_discretized_count_df_q3.reset_index(drop = True)
# Now we are getting the sum of each production company category.
week_discretized_count_df_q3 = movies_discretized_count_df_q3.groupby(["week"])["counts"].sum()
#Checking the results
week_discretized_count_df_q3
'''We ultimately want a column that contains the total counts for each week. We are going to create
a new column that replicates the week called week_count and then we will use the replace function to
replace the 1s with their total count, the 2s with their total count... '''
#First, replicating the income level column in a column named budget_category_count
movies_discretized_count_df_q3["week_count"] = movies_discretized_count_df_q3["week"]
#Now replacing the income level with the total count for each income level
movies_discretized_count_df_q3["week_count"] = movies_discretized_count_df_q3["week_count"].replace(["week_1"], 37)
movies_discretized_count_df_q3["week_count"] = movies_discretized_count_df_q3["week_count"].replace(["week_2"], 34)
movies_discretized_count_df_q3["week_count"] = movies_discretized_count_df_q3["week_count"].replace(["week_3"], 33)
movies_discretized_count_df_q3["week_count"] = movies_discretized_count_df_q3["week_count"].replace(["week_4"], 32)
movies_discretized_count_df_q3.head()
#Okay, we are one step closer... Now, we need to create a column that takes the counts/week_count * 100
movies_discretized_count_df_q3["percent"] = movies_discretized_count_df_q3["counts"]/movies_discretized_count_df_q3["week_count"] *100
#Looking at our data frame... It worked!!!
movies_discretized_count_df_q3.head()
#We no longer need the count columns
movies_discretized_count_df_q3.drop(["counts", "week_count"], axis = 1, inplace = True )
'''Attempting to graph this data using a grouped bar chart:
formula: df.pivot(columns, group, values).plot(kind = "type of graph", color = ["color to use, can be a list of colors"],
title = "you can set the title of your graph here")'''
graph = movies_discretized_count_df_q3.pivot("week", "percent_profit_category",
"percent").plot(kind="bar", color = ["crimson", "salmon", "palegreen", "darkgreen"],
title = "Percent of Percent Profit to Week")
#Changing the y label of our graph to Percent
plt.ylabel("Percent")
#Changing the x axis label of our graph to Budget Category
plt.xlabel("Week")
#Making it so the tick labels are not angled
plt.xticks(rotation = 0)
#How to change the tick labels (we ended up not needing this, but want to keep for future reference)
#plt.Axes.set_xticklabels(graph, labels = ['extremely low', 'low', 'high', 'extremely high'])
#moving the legend position to underneath the graph, also setting it to have 4 columns so the legend is in a
#straight single line and adding a legend title
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.4), ncol = 4, title = "Percent Profit Category")
This is interesting in that it shows that movies released within the first two weeks of the month tend to be more profitable. We would like to look at a breakdown of month to percent profit for further analysis
# Taking a brief detour back to our non-discretized df
movies_original_df.head()
# QUESTION: Do "Good" Movies Make Money? -- We're defining "Good" as vote average
plt.plot(movies_original_df.profit, movies_original_df.vote_average, 'o')
plt.title('Do "Good" Movies Make Money?')
plt.xlabel('Profit')
plt.ylabel('Vote Average')
plt.show()
# QUESTION: Does Popularity = Profit?
plt.plot(movies_original_df.profit, movies_original_df.popularity, 'o')
plt.title('Does Popularity = Profits?')
plt.xlabel('Profit')
plt.ylabel('Popularity')
plt.show()
# QUESTION: How does budget impact vote average?
plt.plot(movies_original_df.budget, movies_original_df.vote_average, 'o')
plt.title('How does Budget Impact Vote Average?')
plt.xlabel('Budget')
plt.ylabel('Vote Average')
plt.show()
# QUESTION: How does budget impact popularity?
plt.plot(movies_original_df.budget, movies_original_df.popularity, 'o')
plt.title('How does Budget Impact Popularity?')
plt.xlabel('Budget')
plt.ylabel('Popularity')
plt.show()
# QUESTION: Is there a relationship between "Above Average Movies" and Budget/Price?
below_avg = movies_original_df[movies_original_df.vote_average < 6.5]
above_avg = movies_original_df[movies_original_df.vote_average >= 6.5]
plt.plot(below_avg.budget, below_avg.profit, 'o', label="below average")
plt.plot(above_avg.budget, above_avg.profit, 'o', label="above average")
plt.title('BUDGET vs PROFIT by AVERAGE VOTE!')
plt.xlabel('BUDGET')
plt.ylabel('PROFIT')
plt.legend()
plt.show()
# QUESTION: Is there a relationship between "Above Average Movies" and Budget/Price?
below_avg = movies_original_df[movies_original_df.vote_average < 6.5]
above_avg = movies_original_df[movies_original_df.vote_average >= 6.5]
plt.plot(below_avg.budget, below_avg.percent_profit, 'o', label="below average")
plt.plot(above_avg.budget, above_avg.percent_profit, 'o', label="above average")
plt.title('BUDGET vs PERCENT PROFIT by AVERAGE VOTE!')
plt.xlabel('BUDGET')
plt.ylabel('PROFIT')
plt.legend()
plt.show()
# BIG QUESTION: What role do production companies play in the entertainment industry?
# Is there a relationship between production studio and average vote?
# Production studio and budget?
# Production studio and percent profit?
# Adding the BIG EIGHT Production Studios to the DF
# WARNER BROS
wb = []
for movie in movies_original_df['production_companies']:
if "Warner Bros" in movie:
wb.append(True)
else:
wb.append(False)
movies_original_df['wb'] = wb
# MGM
mgm = []
for movie in movies_original_df['production_companies']:
if "MGM" in movie:
mgm.append(True)
else:
mgm.append(False)
movies_original_df['mgm'] = mgm
# DREAMWORKS
dw = []
for movie in movies_original_df['production_companies']:
if "DreamWorks" in movie:
dw.append(True)
else:
dw.append(False)
movies_original_df['dw'] = dw
# SONY
sony = []
for movie in movies_original_df['production_companies']:
if "Sony" in movie:
sony.append(True)
else:
sony.append(False)
movies_original_df['sony'] = sony
# DISNEY
disney = []
for movie in movies_original_df['production_companies']:
if "Disney" in movie:
disney.append(True)
else:
disney.append(False)
movies_original_df['disney'] = disney
#FOX
fox = []
for movie in movies_original_df['production_companies']:
if "Century Fox" in movie:
fox.append(True)
else:
fox.append(False)
movies_original_df['fox'] = fox
# PARAMOUNT
paramount = []
for movie in movies_original_df['production_companies']:
if "Paramount" in movie:
paramount.append(True)
else:
paramount.append(False)
movies_original_df['paramount'] = paramount
#UNIVERSAL
universal = []
for movie in movies_original_df['production_companies']:
if "Universal" in movie:
universal.append(True)
else:
universal.append(False)
movies_original_df['universal'] = universal
sony = movies_original_df[movies_original_df.sony == True]
wb = movies_original_df[movies_original_df.wb == True]
disney = movies_original_df[movies_original_df.disney == True]
fox = movies_original_df[movies_original_df.fox == True]
universal = movies_original_df[movies_original_df.universal == True]
paramount = movies_original_df[movies_original_df.paramount == True]
dw = movies_original_df[movies_original_df.dw == True]
mgm = movies_original_df[movies_original_df.mgm == True]
plt.plot(sony.budget, sony.revenue, 'o', label="Sony")
plt.plot(wb.budget, wb.revenue, 'o', label="Warner Bros.")
plt.plot(disney.budget, disney.revenue, 'o', label="Disney")
plt.plot(fox.budget, fox.revenue, 'o', label="Fox")
plt.plot(universal.budget, universal.revenue, 'o', label="Universal")
plt.plot(paramount.budget, paramount.revenue, 'o', label="Paramount")
plt.plot(dw.budget, dw.revenue, 'o', label="DreamWorks")
plt.plot(mgm.budget, mgm.revenue, 'o', label="MGM")
plt.title('BUDGET vs REVENUE by PRODUCTION COMPANY')
plt.xlabel('BUDGET')
plt.ylabel('REVENUE')
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.4), ncol = 4, title = "Production Company")
plt.show()
sony = movies_original_df[movies_original_df.sony == True]
wb = movies_original_df[movies_original_df.wb == True]
disney = movies_original_df[movies_original_df.disney == True]
fox = movies_original_df[movies_original_df.fox == True]
universal = movies_original_df[movies_original_df.universal == True]
paramount = movies_original_df[movies_original_df.paramount == True]
dw = movies_original_df[movies_original_df.dw == True]
mgm = movies_original_df[movies_original_df.mgm == True]
plt.plot(sony.budget, sony.percent_profit, 'o', label="Sony")
plt.plot(wb.budget, wb.percent_profit, 'o', label="Warner Bros.")
plt.plot(disney.budget, disney.percent_profit, 'o', label="Disney")
plt.plot(fox.budget, fox.percent_profit, 'o', label="Fox")
plt.plot(universal.budget, universal.percent_profit, 'o', label="Universal")
plt.plot(paramount.budget, paramount.percent_profit, 'o', label="Paramount")
plt.plot(dw.budget, dw.percent_profit, 'o', label="DreamWorks")
plt.plot(mgm.budget, mgm.percent_profit, 'o', label="MGM")
plt.title('BUDGET vs PERCENT PROFIT by PRODUCTION COMPANY')
plt.xlabel('BUDGET')
plt.ylabel('PERCENT PROFIT')
plt.legend( loc = "lower center", bbox_to_anchor = (.5, -.4), ncol = 4, title = "Production Company")
plt.show()