# Outputs one .txt file per scraped IMDb review, for corpus creation.
import re
import urllib
import urllib.request

from bs4 import BeautifulSoup
def get_reviews(rating):
    """Download one page of IMDb user reviews and save each review to a file.

    Fetches the reviews page for title ``tt0241527`` with ``rating`` appended
    to the ``ratingFilter`` query parameter (sorted by helpfulness score,
    descending). For every ``div.imdb-user-review`` element on that page, the
    displayed score, the review title and the review body are passed to
    ``print_to_file``.

    A review that lacks one of the expected elements, or whose file cannot be
    written, is reported with ``nope`` on stdout and skipped; the remaining
    reviews are still processed.

    Args:
        rating: Value for the ``ratingFilter`` query parameter; converted
            with ``str()`` before being appended to the URL.

    Note:
        Errors raised while fetching the page (e.g. ``urllib.error.URLError``)
        are not handled here and propagate to the caller.
    """
    url = "https://www.imdb.com/title/tt0241527/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=" + str(rating)
    # Use the response as a context manager so the connection is released
    # as soon as the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    review_blocks = soup.find_all("div", {"class": "imdb-user-review"})
    for num, block in enumerate(review_blocks):
        scale = block.find("span", {"class": "point-scale"})
        title = block.find("a", {"class": "title"})
        body = block.find("div", {"class": "text show-more__control"})
        try:
            # The score is read from the node immediately preceding the
            # "point-scale" span.
            score = scale.previous_sibling.text
            review = title.text + "==" + body.text
        except AttributeError:
            # find() returned None for one of the elements, or the node
            # before the scale span has no .text attribute.
            print('nope')
            continue
        try:
            print_to_file(score, review, num)
        except (OSError, UnicodeError):
            # Best effort: a review that cannot be written is skipped rather
            # than aborting the whole page.
            print('nope')
def print_to_file(rating, review, num):
    """Write one review to ``<rating>_hp_<num>.txt`` in the current directory.

    The file content is the rating, the separator ``**`` and the review text,
    with nothing added before or after. An existing file of the same name is
    overwritten.

    Args:
        rating: Review score; converted with ``str()`` for both the file name
            and the file content.
        review: Review text to store after the ``**`` separator.
        num: Index of the review, used to make the file name unique within
            one rating.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    rating_label = str(rating)
    both = rating_label + '**' + review
    output_filename = rating_label + '_hp_' + str(num) + '.txt'
    # Explicit UTF-8 so non-ASCII review text does not depend on the platform
    # default encoding; "with" guarantees the handle is closed even if the
    # write fails.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(both)
# Script driver: request the review page once for each rating filter value
# from 1 through 10, saving every review found along the way.
LOWEST_RATING, HIGHEST_RATING = 1, 10
for rating in range(LOWEST_RATING, HIGHEST_RATING + 1):
    get_reviews(rating)