WHAT MAKES AN OSCAR WINNER?

Scraping IMSDB for Scripts

Outputs txt files for corpus creation!!

In [2]:
import re
import urllib
from bs4 import BeautifulSoup
In [ ]:
# url = "http://awardsdatabase.oscars.org/Search/GetResults?query=%7B%22AwardShowFrom%22:91,%22Sort%22:%223-Award%20Category-Chron%22,%22Search%22:%22Basic%22%7D"
In [50]:
def get_reviews(movie):
    try:
        url = "https://www.imsdb.com/scripts/"+ movie+".html"
    #     url = "https://www.imdb.com/title/tt0241527/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=" + str(rating)
        html = urllib.request.urlopen(url).read()
        soup = BeautifulSoup(html, 'html.parser')
#         print('yes for', movie)
        text = soup.findAll("td", {"class": "scrtext"})
#         print(text.text())
#         for num,t in enumerate(text):
#             scale = t.find("span", {"class": "point-scale"})
#             review = t.find("div", {"class": "text show-more__control"})
        try:
            print_to_file(movie, text)
        except:
            print('fake out')
    except:
        print('nope for', movie)

def print_to_file(movie, script):
    output_filename = movie + '_script.txt'
    outfile = open(output_filename, 'w')
    outfile.write(str(script))
    outfile.close()
        
# for rating in range(10):
#     get_reviews(rating)
In [51]:
def get_movies():
    url = "https://www.today.com/popculture/complete-list-every-best-picture-oscar-winner-ever-t107617"
    html = urllib.request.urlopen(url).read()
    soup = BeautifulSoup(html, 'html.parser')
    text = soup.findAll("li")
    for num,t in enumerate(text):
        print(t.text)
        # Kendra manually copypasted these for quickness
# get_movies()
In [53]:
file = open('FinalProject/oscar_movies.txt').readlines()
# import re


# bad_chars = ''
# rgx = re.compile('[%s]' % bad_chars)

all_movies = []
for line in file:
    movie = line.split('-')[1].strip()
    movie = movie.replace('"', '')
    movie = movie.replace(' ', '-')
    all_movies.append(movie)
    
for movie in all_movies:
    get_reviews(movie)
# get_reviews('Birdman')
nope for Schindler’s-List
nope for Unforgiven
nope for Casablanca
In [40]:
all_movies
Out[40]:
['Green-Book',
 'The-Shape-of-Water',
 'Moonlight',
 'Spotlight',
 'Birdman',
 '12-Years-a-Slave',
 'Argo',
 'The-Artist',
 "The-King's-Speech",
 'The-Hurt-Locker',
 'Slumdog-Millionaire',
 'No-Country-for-Old-Men',
 'The-Departed',
 'Crash',
 'Million-Dollar-Baby',
 'The-Lord-of-the-Rings:-The-Return-of-the-King',
 'Chicago',
 'A-Beautiful-Mind',
 'Gladiator',
 'American-Beauty',
 'Shakespeare-in-Love',
 'Titanic',
 'The-English-Patient',
 'Braveheart',
 'Forrest-Gump',
 'Schindler’s-List',
 'Unforgiven',
 'The-Silence-of-the-Lambs',
 'Dances-With-Wolves',
 'Driving-Miss-Daisy',
 'Rain-Man',
 'The-Last-Emperor',
 'Platoon',
 'Out-of-Africa',
 'Amadeus',
 'Terms-of-Endearment',
 'Gandhi',
 'Chariots-of-Fire',
 'Ordinary-People',
 'Kramer-vs.-Kramer',
 'The-Deer-Hunter',
 'Annie-Hall',
 'Rocky',
 "One-Flew-over-the-Cuckoo's-Nest",
 'The-Godfather-Part-II',
 'The-Sting',
 'The-Godfather',
 'The-French-Connection',
 'Patton',
 'Midnight-Cowboy',
 'Oliver!',
 'In-the-Heat-of-the-Night',
 'A-Man-for-All-Seasons',
 'The-Sound-of-Music',
 'My-Fair-Lady',
 'Tom-Jones',
 'Lawrence-of-Arabia',
 'West-Side-Story',
 'The-Apartment',
 'Ben',
 'Gigi',
 'The-Bridge-on-the-River-Kwai',
 'Around-the-World-in-80-Days',
 'Marty',
 'On-the-Waterfront',
 'From-Here-to-Eternity',
 'The-Greatest-Show-on-Earth',
 'An-American-in-Paris',
 'All-About-Eve',
 'All-the-Kings-Men',
 'Hamlet',
 "Gentleman's-Agreement",
 'The-Best-Years-of-Our-Lives',
 'The-Lost-Weekend',
 'Going-My-Way',
 'Casablanca',
 'Mrs.-Miniver',
 'How-Green-Was-My-Valley',
 'Rebecca',
 'Gone-with-the-Wind',
 "You-Can't-Take-It-with-You",
 'The-Life-of-Emile-Zola',
 'The-Great-Ziegfeld',
 'Mutiny-on-the-Bounty',
 'It-Happened-One-Night',
 'Cavalcade',
 'Grand-Hotel',
 'Cimarron',
 'All-Quiet-on-the-Western-Front',
 'The-Broadway-Melody',
 'Wings']
In [ ]: