HOW WE FEEL ABOUT HARRY

Scraping IMDb for Harry Potter reviews

Outputs one .txt file per review (rating and review text) for corpus creation.

In [ ]:
import re
import urllib
from bs4 import BeautifulSoup
In [ ]:
def get_reviews(rating):
    """Download one page of IMDb user reviews and save each review to a file.

    Fetches the reviews page for title ``tt0241527`` (sorted by helpfulness,
    descending) filtered by *rating*, then passes every review's rating text,
    review text and position on the page to ``print_to_file``.

    Args:
        rating: Value appended to the ``ratingFilter`` query parameter
            (converted with ``str()``).

    Raises:
        urllib.error.URLError: If the page cannot be fetched.

    Reviews that lack the expected markup, or that cannot be written to disk,
    are skipped with a ``'nope'`` message rather than aborting the run.
    """
    # A plain ``import urllib`` does not guarantee that the ``request``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.request

    url = "https://www.imdb.com/title/tt0241527/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=" + str(rating)
    # Close the HTTP response deterministically once the body has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    containers = soup.find_all("div", {"class": "imdb-user-review"})
    for num, container in enumerate(containers):
        scale = container.find("span", {"class": "point-scale"})
        review = container.find("div", {"class": "text show-more__control"})
        try:
            # The rating value is the element immediately before the
            # "point-scale" span.
            print_to_file(scale.previous_sibling.text, review.text, num)
        except (AttributeError, TypeError, ValueError, OSError):
            # Missing rating/review markup (``None`` lookups) or a failed
            # write: skip this review but keep going. Unlike a bare
            # ``except``, this lets KeyboardInterrupt/SystemExit propagate.
            print('nope')

def print_to_file(rating, review, num):
    """Write a single review to ``<rating>_hp_<num>.txt`` in the working directory.

    The file contains the rating and the review text joined by ``**``.
    An existing file with the same name is overwritten.

    Args:
        rating: Rating of the review; converted with ``str()`` for both the
            file name and the file contents.
        review: Review text (a string).
        num: Index of the review, used to build the file name.

    Raises:
        OSError: If the file cannot be created or written.
    """
    rating_text = str(rating)
    output_filename = rating_text + '_hp_' + str(num) + '.txt'
    # Explicit UTF-8 so reviews with non-ASCII characters are written the
    # same way on every platform; ``with`` closes the file even if the
    # write fails.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(rating_text + '**' + review)
        
# IMDb star ratings run from 1 to 10 inclusive. ``range(10)`` would query
# filters 0-9 and never request the 10-star reviews, so iterate 1..10.
for rating in range(1, 11):
    get_reviews(rating)