# Outputs .txt files (one per scraped IMDb user review) for corpus creation.
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
def get_reviews(rating):
    """Fetch one IMDb review page and save each review to its own file.

    Downloads the user-review listing for title ``tt0241527`` with
    ``ratingFilter`` set to *rating*, then hands the score text and review
    text of every ``imdb-user-review`` container to ``print_to_file``.

    Args:
        rating: Value appended to the ``ratingFilter`` query parameter
            (converted with ``str``).

    Reviews whose score or body element cannot be located are skipped with
    a ``nope`` message instead of aborting the whole page.
    """
    url = (
        "https://www.imdb.com/title/tt0241527/reviews"
        "?sort=helpfulnessScore&dir=desc&ratingFilter=" + str(rating)
    )
    # The context manager closes the HTTP response even if read() fails.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all("div", {"class": "imdb-user-review"})
    for num, container in enumerate(containers):
        scale = container.find("span", {"class": "point-scale"})
        review = container.find("div", {"class": "text show-more__control"})
        try:
            # The score is read from the node immediately preceding the
            # "point-scale" span.
            score_text = scale.previous_sibling.text
            review_text = review.text
        except AttributeError:
            # find() returned None (or there is no preceding node): this
            # container has no score or no review body, so skip it.
            print('nope')
            continue
        try:
            print_to_file(score_text, review_text, num)
        except OSError as err:
            # Keep the best-effort behaviour (one bad file must not stop the
            # scrape) but say what actually went wrong.
            print('nope: could not write review %d: %s' % (num, err))
def print_to_file(rating, review, num):
    """Write one review to ``<rating>_hp_<num>.txt`` in the current directory.

    The file content is the rating, the separator ``**``, then the review
    text. An existing file with the same name is overwritten.

    Args:
        rating: Score label for the review; used in both the file name and
            the file content (converted with ``str`` formatting).
        review: Review body text.
        num: Index of the review, used to make the file name unique.

    Raises:
        OSError: If the output file cannot be created or written.
    """
    both = f"{rating}**{review}"
    output_filename = f"{rating}_hp_{num}.txt"
    # Explicit UTF-8 so non-ASCII review text does not depend on the locale's
    # default encoding; `with` guarantees the handle is closed on error.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(both)
# Scrape one page per star rating. IMDb user ratings run from 1 to 10, so
# iterate 1..10 inclusive; range(10) stopped at 9 and never requested the
# 10-star filter.
# NOTE(review): ratingFilter=0 is no longer requested -- presumably it means
# "no filter" rather than a real rating; confirm against the site.
for rating in range(1, 11):
    get_reviews(rating)