# Outputs .txt files for corpus creation.
import re
import urllib
import urllib.parse
import urllib.request

from bs4 import BeautifulSoup
# url = "http://awardsdatabase.oscars.org/Search/GetResults?query=%7B%22AwardShowFrom%22:91,%22Sort%22:%223-Award%20Category-Chron%22,%22Search%22:%22Basic%22%7D"
def get_reviews(movie):
    """Download a movie's script page from imsdb.com and save it to disk.

    Despite its name, this function fetches the *script* page
    ``https://www.imsdb.com/scripts/<movie>.html``, collects every
    ``<td class="scrtext">`` element and hands the result set to
    ``print_to_file``.

    The function is best-effort: failures are reported on stdout and never
    raised, so a batch loop over many titles keeps going.

    Args:
        movie: Title slug as used in imsdb URLs (e.g. ``'Birdman'``).

    Returns:
        None.
    """
    try:
        # Percent-encode the slug so titles containing characters such as
        # '?', '#' or non-ASCII letters still form a valid URL path.
        url = "https://www.imsdb.com/scripts/" + urllib.parse.quote(movie) + ".html"
        # The context manager closes the HTTP response even if read() fails.
        with urllib.request.urlopen(url) as response:
            html = response.read()
        soup = BeautifulSoup(html, 'html.parser')
        text = soup.findAll("td", {"class": "scrtext"})
    except Exception as exc:
        # Catch Exception (not a bare except) so Ctrl-C / SystemExit still
        # stop a long scraping run; include the reason so failures are
        # diagnosable.
        print('nope for', movie, '-', repr(exc))
        return

    try:
        print_to_file(movie, text)
    except Exception as exc:
        print('fake out', '-', repr(exc))
def print_to_file(movie, script):
    """Write ``str(script)`` to ``<movie>_script.txt``.

    Args:
        movie: Prefix of the output filename; ``'_script.txt'`` is appended.
        script: Object to serialise; it is converted with ``str()``.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    output_filename = movie + '_script.txt'
    # ``with`` guarantees the handle is closed even if write() raises, and an
    # explicit UTF-8 encoding avoids UnicodeEncodeError on platforms whose
    # default encoding cannot represent every character in the script.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(str(script))
# for rating in range(10):
# get_reviews(rating)
def get_movies():
    """Print the text of every ``<li>`` element on a today.com article page.

    The page fetched is today.com's "complete list of every Best Picture
    Oscar winner" article. Each list item's text is printed to stdout on its
    own line; nothing is returned. Network or parsing errors propagate to the
    caller.
    """
    url = "https://www.today.com/popculture/complete-list-every-best-picture-oscar-winner-ever-t107617"
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.findAll("li"):
        print(item.text)
# Kendra manually copypasted these for quickness
# get_movies()
# Load the movie list; the context manager closes the file once the lines
# have been read. ``file`` keeps its original meaning: the list of raw lines.
with open('FinalProject/oscar_movies.txt') as movie_list:
    file = movie_list.readlines()

# Build URL slugs from the list. The title is taken to be the text between
# the first and second '-' on each line (presumably "<year> - <title>";
# verify against the data file). Double quotes are dropped and spaces become
# hyphens.
all_movies = []
for line in file:
    fields = line.split('-')
    if len(fields) < 2:
        # Blank or dash-less line (e.g. a trailing newline): there is no
        # title field, so skip it instead of raising IndexError.
        continue
    movie = fields[1].strip()
    movie = movie.replace('"', '')
    movie = movie.replace(' ', '-')
    all_movies.append(movie)

# Fetch and save the script page for every title.
for movie in all_movies:
    get_reviews(movie)

# Bare expression: a no-op when run as a script; echoes the list when run in
# an interactive or notebook session.
all_movies