In [111]:
import pandas as pd
import numpy as np

tn_df = pd.read_csv('0909_TN_all.csv')

def return_just_num(num):
    num = num.split('$')[1]
    num = num.replace(',','')
    return(num)
    
tn_df['ww_bo'] = tn_df.apply(lambda x: return_just_num(x['WorldwideBoxOffice']),axis=1)
tn_df['int_bo'] = tn_df.apply(lambda x: return_just_num(x['InternationalBoxOffice']),axis=1)
tn_df['dom_bo'] = tn_df.apply(lambda x: return_just_num(x['DomesticBoxOffice']),axis=1)
tn_df['prod_budget'] = tn_df.apply(lambda x: return_just_num(x['ProductionBudget']),axis=1)    
tn_df
len(tn_df)
# for t in tn_df:
#     print(t)
Out[111]:
1774
In [104]:
##################################
## GET PRODUCTION STAFF
##################################
all_movies_production = []
for index, row in tn_df.iterrows():
    all_production = []
    for person in eval(row['production']):
        new_row = {}
        new_row.update({ 'title': row['Title'] })
        new_row.update({ 'role': list(person.keys())[0] })
        new_row.update({ 'person': list(person.values())[0] })
        new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
        new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })      
        new_row.update({ 'genre': row['Genre'] })
        new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
        new_row.update({ 'budget': row['prod_budget'] })
        new_row.update({ 'ww_bo': row['ww_bo'] })
        new_row.update({ 'dom_bo': row['dom_bo'] })
        new_row.update({ 'int_bo': row['int_bo'] })
        new_row.update({ 'source': row['Source'] })
        new_row.update({ 'creative_type': row['CreativeType'] })
        new_row.update({ 'released': row['Released_2'] })
        all_production.append(new_row)
    all_movies_production += all_production
In [106]:
##################################
## GET LEADS
##################################
all_movies_leads = []
for index, row in tn_df.iterrows():
    new_row = {}
    all_leads = []
    for person in eval(row['leads']):
        new_row = {}
        new_row.update({ 'title': row['Title'] })
        new_row.update({ 'role': 'lead' })
        new_row.update({ 'person': person })
        new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
        new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })      
        new_row.update({ 'genre': row['Genre'] })
        new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
        new_row.update({ 'budget': row['prod_budget'] })
        new_row.update({ 'ww_bo': row['ww_bo'] })
        new_row.update({ 'dom_bo': row['dom_bo'] })
        new_row.update({ 'int_bo': row['int_bo'] })
        new_row.update({ 'source': row['Source'] })
        new_row.update({ 'creative_type': row['CreativeType'] })
        new_row.update({ 'released': row['Released_2'] })
        all_leads.append(new_row)
    all_movies_leads += all_leads
In [105]:
##################################
## GET SUPPORTING CAST
##################################
all_movies_supporting = []
for index, row in tn_df.iterrows():
    new_row = {}
    all_supporting = []
    for person in eval(row['supporting']):
        new_row = {}
        new_row.update({ 'title': row['Title'] })
        new_row.update({ 'role': 'supporting' })
        new_row.update({ 'person': person })
        new_row.update({ 'dom_percent_profit': (float(row['dom_bo'])/(float(row['dom_bo']) - float(row['prod_budget'])))*100 })
        new_row.update({ 'int_percent_profit': (float(row['int_bo'])/(float(row['int_bo']) - float(row['prod_budget'])))*100 })      
        new_row.update({ 'genre': row['Genre'] })
        new_row.update({ 'theatrical_distributor': row['TheatricalDistributor'] })
        new_row.update({ 'budget': row['prod_budget'] })
        new_row.update({ 'ww_bo': row['ww_bo'] })
        new_row.update({ 'dom_bo': row['dom_bo'] })
        new_row.update({ 'int_bo': row['int_bo'] })
        new_row.update({ 'source': row['Source'] })
        new_row.update({ 'creative_type': row['CreativeType'] })
        new_row.update({ 'released': row['Released_2'] })
        all_supporting.append(new_row)
    all_movies_supporting += all_supporting
In [107]:
print(len(all_movies_production))
print(len(all_movies_leads))
print(len(all_movies_supporting))
67226
5614
48353
In [108]:
all_movies = []
all_movies += all_movies_production
all_movies += all_movies_leads
all_movies += all_movies_supporting
all_movies_df = pd.DataFrame(all_movies)
all_movies_df = all_movies_df.drop_duplicates()
all_movies_df['person_count_overall'] = all_movies_df.groupby('person')['person'].transform('count')
all_movies_df['person_role_count'] = all_movies_df.groupby(['person', 'role'])['person'].transform('count')
all_movies_df['person_role_count'] = all_movies_df.groupby(['person', 'role'])['person'].transform('count')
df = all_movies_df

import datetime 
# filename = 'TN_longform_' + str(datetime.datetime.now())
filename = 'TN_longform'
all_movies_df.to_csv(filename + '.csv')
In [109]:
## TESTING
len(df[(df['role'] == 'Composer') & (df['person_count_overall'] > 2)])
## Should be 796
Out[109]:
796
In [85]:
## FUN QUERIES!!
# df[df['person'] == 'Chris Columbus']
# df.sort_values(by='person_count_overall', ascending=False)[:100]
# randy = all_movies_df[all_movies_df['person'] == 'Randy Thom']
# randy.groupby('').count()
In [98]:
time = str(datetime.datetime.now())
str(time)
Out[98]:
'2019-09-12 14:24:31.371608'
In [ ]: