import re
import urllib
from bs4 import BeautifulSoup
_AWARDS_GUIDE_URL = "https://www.vanityfair.com/hollywood/2019/11/ultimate-guide-to-awards-season"


def get_reviews(url=_AWARDS_GUIDE_URL):
    """Scrape award nominations, grouped by category, from a guide page.

    Every ``div`` with class ``slide-module--slide--m310h`` is treated as one
    award category: its ``h3`` text is the category name, and each ``div``
    nested under the third ``div`` of the slide's first ``div`` is treated as
    one candidate entry.

    Args:
        url: Page to download and parse. Defaults to the Vanity Fair
            awards-season guide.

    Returns:
        A list with one single-key dict per slide, in page order:
        ``{category_name: [{'movie': str, 'nominated': str}, ...]}``.
        Entries whose markup does not have the expected nested ``span``
        layout keep empty strings for the fields that could not be read;
        they are returned rather than filtered so the caller decides what
        to drop.

    Raises:
        urllib.error.URLError: If the page cannot be downloaded.
        AttributeError: If a slide has no ``h3`` or no ``div`` child.
        IndexError: If a slide's first ``div`` has fewer than three
            nested ``div`` elements.
    """
    # `import urllib` on its own does not guarantee that the `request`
    # submodule is loaded, so import it explicitly before using it.
    import urllib.request

    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        html = response.read()

    soup = BeautifulSoup(html, 'html.parser')
    slides = soup.findAll("div", {"class": "slide-module--slide--m310h"})

    all_nominations = []
    for slide in slides:
        category = slide.find('h3').text
        entry_blocks = slide.find('div').findAll('div')[2].findAll('div')

        all_in_category = []
        for block in entry_blocks:
            spans = block.findAll('span')
            entry = {'movie': '', 'nominated': ''}
            try:
                # The fourth span of an entry wraps the title span followed
                # by the nominee span.
                inner_spans = spans[3].findAll('span')
                entry['movie'] = inner_spans[0].text
                entry['nominated'] = inner_spans[1].text
            except IndexError:
                # Layout differs from what is expected: keep whatever was
                # read so far and leave the remaining fields blank.
                pass
            all_in_category.append(entry)

        all_nominations.append({category: all_in_category})
    return all_nominations
import numpy as np
import pandas as pd


def _clean_nominations(entries):
    """Build a de-duplicated nominations table without blank-movie rows.

    Args:
        entries: List of ``{'movie': str, 'nominated': str}`` dicts for a
            single award category.

    Returns:
        A DataFrame with ``movie`` and ``nominated`` columns. Duplicate rows
        and rows whose ``movie`` is an empty string are removed; the
        original row labels of the surviving rows are kept.
    """
    # Fixing the columns up front keeps an empty category from raising
    # KeyError on 'movie' below.
    frame = pd.DataFrame(entries, columns=['movie', 'nominated'])
    frame = frame.drop_duplicates()
    # Assign the result back: calling replace(..., inplace=True) on the
    # selected column is a chained mutation that does not reach the frame
    # under pandas Copy-on-Write, so blank rows would never be dropped.
    frame['movie'] = frame['movie'].replace('', np.nan)
    return frame.dropna()


# Scrape once; each element of `results` is a single-key dict mapping a
# category name to its list of entries.
results = get_reviews()

# The bare names below are notebook-style displays; they evaluate to nothing
# visible when this file is run as a plain script.
best_picture = _clean_nominations(results[0]['BestPicture'])
best_picture

best_director = _clean_nominations(results[1]['BestDirector'])
best_director

best_actor = _clean_nominations(results[2]['BestActor'])
best_actor

best_actress = _clean_nominations(results[3]['BestActress'])
best_actress