import pandas as pd
import numpy as np
tn_df = pd.read_csv('0909_TN_all.csv')
def return_just_num(num):
num = num.split('$')[1]
num = num.replace(',','')
return(num)
tn_df['ww_bo'] = tn_df.apply(lambda x: return_just_num(x['WorldwideBoxOffice']),axis=1)
tn_df['int_bo'] = tn_df.apply(lambda x: return_just_num(x['InternationalBoxOffice']),axis=1)
tn_df['dom_bo'] = tn_df.apply(lambda x: return_just_num(x['DomesticBoxOffice']),axis=1)
tn_df['prod_budget'] = tn_df.apply(lambda x: return_just_num(x['ProductionBudget']),axis=1)
tn_df
len(tn_df)
# for t in tn_df:
# print(t)
##################################
## GET PRODUCTION STAFF
##################################
all_movies_production = []
for index, row in tn_df.iterrows():
all_production = []
for person in eval(row['production']):
new_row = {}
new_row.update({ 'title': row['Title'] })
new_row.update({ 'role': list(person.keys())[0] })
new_row.update({ 'person': list(person.values())[0] })
new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'genre': row['Genre'] })
new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
new_row.update({ 'budget': row['prod_budget'] })
new_row.update({ 'ww_bo': row['ww_bo'] })
new_row.update({ 'dom_bo': row['dom_bo'] })
new_row.update({ 'int_bo': row['int_bo'] })
new_row.update({ 'source': row['Source'] })
new_row.update({ 'creative_type': row['CreativeType'] })
new_row.update({ 'released': row['Released_2'] })
all_production.append(new_row)
all_movies_production += all_production
##################################
## GET LEADS
##################################
all_movies_leads = []
for index, row in tn_df.iterrows():
new_row = {}
all_leads = []
for person in eval(row['leads']):
new_row = {}
new_row.update({ 'title': row['Title'] })
new_row.update({ 'role': 'lead' })
new_row.update({ 'person': person })
new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'genre': row['Genre'] })
new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
new_row.update({ 'budget': row['prod_budget'] })
new_row.update({ 'ww_bo': row['ww_bo'] })
new_row.update({ 'dom_bo': row['dom_bo'] })
new_row.update({ 'int_bo': row['int_bo'] })
new_row.update({ 'source': row['Source'] })
new_row.update({ 'creative_type': row['CreativeType'] })
new_row.update({ 'released': row['Released_2'] })
all_leads.append(new_row)
all_movies_leads += all_leads
##################################
## GET SUPPORTING CAST
##################################
all_movies_supporting = []
for index, row in tn_df.iterrows():
new_row = {}
all_supporting = []
for person in eval(row['supporting']):
new_row = {}
new_row.update({ 'title': row['Title'] })
new_row.update({ 'role': 'supporting' })
new_row.update({ 'person': person })
new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })
new_row.update({ 'genre': row['Genre'] })
new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
new_row.update({ 'budget': row['prod_budget'] })
new_row.update({ 'ww_bo': row['ww_bo'] })
new_row.update({ 'dom_bo': row['dom_bo'] })
new_row.update({ 'int_bo': row['int_bo'] })
new_row.update({ 'source': row['Source'] })
new_row.update({ 'creative_type': row['CreativeType'] })
new_row.update({ 'released': row['Released_2'] })
all_supporting.append(new_row)
all_movies_supporting += all_supporting
print(len(all_movies_production))
print(len(all_movies_leads))
print(len(all_movies_supporting))
all_movies = []
all_movies += all_movies_production
all_movies += all_movies_leads
all_movies += all_movies_supporting
all_movies_df = pd.DataFrame(all_movies)
all_movies_df = all_movies_df.drop_duplicates()
all_movies_df['person_count_overall'] = all_movies_df.groupby('person')['person'].transform('count')
all_movies_df['person_role_count'] = all_movies_df.groupby(['person', 'role'])['person'].transform('count')
all_movies_df['person_role_count'] = all_movies_df.groupby(['person', 'role'])['person'].transform('count')
df = all_movies_df
import datetime
# filename = 'TN_longform_' + str(datetime.datetime.now())
filename = 'TN_longform'
all_movies_df.to_csv(filename + '.csv')
## TESTING
len(df[(df['role'] == 'Composer') & (df['person_count_overall'] > 2)])
## Should be 796
## FUN QUERIES!!
# df[df['person'] == 'Chris Columbus']
# df.sort_values(by='person_count_overall', ascending=False)[:100]
# randy = all_movies_df[all_movies_df['person'] == 'Randy Thom']
# randy.groupby('').count()
time = str(datetime.datetime.now())
str(time)