HOW WE FEEL ABOUT HARRY

Scraping IMDB for Harry Potter reviews

Outputs one .txt file per review, for use in building a corpus.

import re
import urllib
from bs4 import BeautifulSoup

def get_reviews(rating):
    """Scrape one page of IMDB user reviews for a star rating and save each one.

    Downloads the reviews page for title ``tt0241527`` filtered by ``rating``,
    then, for every ``div.imdb-user-review`` found, writes the review to its
    own text file via ``print_to_file``.

    Args:
        rating: Value appended to the ``ratingFilter`` query parameter
            (converted with ``str``).

    Returns:
        None. Reviews that lack a score, title or body, or whose file cannot
        be written, are skipped and ``nope`` is printed for each.
    """
    # A bare ``import urllib`` does not guarantee the ``request`` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    url = "https://www.imdb.com/title/tt0241527/reviews?sort=helpfulnessScore&dir=desc&ratingFilter=" + str(rating)
    # Close the HTTP response as soon as the page has been read.
    with urllib.request.urlopen(url) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')
    review_blocks = soup.find_all("div", {"class": "imdb-user-review"})
    for num, block in enumerate(review_blocks):
        scale = block.find("span", {"class": "point-scale"})
        title = block.find("a", {"class": "title"})
        body = block.find("div", {"class": "text show-more__control"})
        try:
            # ``find`` returns None when an element is missing, which makes
            # the attribute access below raise AttributeError.
            # NOTE(review): presumably the numeric score is the node right
            # before the point-scale span -- confirm against the page markup.
            score = scale.previous_sibling.text
            review = title.text + "==" + body.text
            print_to_file(score, review, num)
        except (AttributeError, OSError):
            # Best-effort scrape: skip this review and keep going.
            print('nope')

def print_to_file(rating, review, num):
    """Write a single review to ``<rating>_hp_<num>.txt`` in the working directory.

    The file content is the rating, the separator ``**``, and the review text.

    Args:
        rating: Score of the review; used in both the file name and the
            content. Any value that ``str`` can format is accepted.
        review: Review text to store.
        num: Index of the review, used to make the file name unique per rating.

    Raises:
        OSError: If the output file cannot be created or written.
    """
    both = f"{rating}**{review}"
    output_filename = f"{rating}_hp_{num}.txt"
    # Explicit UTF-8 so reviews with non-ASCII characters are written the same
    # way on every platform; ``with`` closes the file even if the write fails.
    with open(output_filename, 'w', encoding='utf-8') as outfile:
        outfile.write(both)
        
# Scrape and save reviews for each rating filter value from 1 through 10.
for rating in range(1, 10 + 1):
    get_reviews(rating)

nope
nope
nope
nope
nope
nope