daily log 12.16.20

2 minute read

THINGS I LEARNED TODAY

  1. If you get errors with eval, use json.loads instead
import json

# Fetch the recipe page and parse its HTML.
url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# Find every text node in <body> that mentions the marker string.
# NOTE(review): presumably this is an inline `wprmpuc_recipe_<id> = {...};`
# script assignment -- confirm against the page source.
searched_word = 'wprmpuc_recipe_'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

# Keep everything after the FIRST '=' only: the JSON payload may itself contain
# '=' or ';' (for example inside URLs or HTML entities), so splitting on every
# occurrence would cut the value short and make the parse fail.
raw_value = results[0].split('=', 1)[1].lstrip()

# raw_decode parses one JSON value from the start of the string and reports
# where it ended, so whatever follows the value (such as a trailing ';') is
# ignored without having to search for a terminator character.
info_dict, json_end = json.JSONDecoder().raw_decode(raw_value)
clean_result = raw_value[:json_end]

ANNA EXTENSION

IMPETUS: We want to be able to aggregate recipe data

PROBLEM: All recipe blogs are slightly different, format-wise

SOLUTION: Find something all recipe pages in all recipe blogs have in common – THE WORD PRINT!!

PROBLEM: How… how will we find this word on each page in a way that works across multiple different blogs?

SOLUTION: Beautiful soup and regex!!

# Download one recipe page and parse it.
url = "https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# Collect every text node inside <body> whose text contains "Print".
searched_word = 'Print'
word_pattern = re.compile('.*{0}.*'.format(searched_word))
results = soup.body.find_all(string=word_pattern, recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

# Show the href of the tag that directly wraps the first match.
first_hit = results[0]
first_hit.parent['href']

ALL TOGETHER NOW!!

# Candidate pages tried with this snippet; the ones marked NOPE did not work.
# Every assignment below is commented out, so `url` must already be set
# (uncomment one line, or reuse the value from an earlier snippet).
# url = "https://www.gimmesomeoven.com/poblano-white-chicken-chili/"
# url = "https://www.skinnytaste.com/lentil-soup-with-butternut-and-kale/"
# url = "https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/"
# NOPE url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
# url = "https://www.twopeasandtheirpod.com/magic-cookie-bars/"
# url = "https://thedefineddish.com/miso-roasted-chicken/"
# url = "https://www.ambitiouskitchen.com/coconut-curried-brown-rice/"
# url = "https://whatsgabycooking.com/chicken-larb-bowls/"
# url = "https://paleomg.com/paleo-blueberry-chai-muffins/"
# NOPE url = "https://pinchofyum.com/30-minute-vegetarian-meatballs"

# Download and parse the selected page.
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# Collect every text node inside <body> whose text contains "Print".
searched_word = 'Print'
word_pattern = re.compile('.*{0}.*'.format(searched_word))
results = soup.body.find_all(string=word_pattern, recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

# Show the href of the tag that directly wraps the first match.
first_hit = results[0]
first_hit.parent['href']

PROBLEM: Ok great! We found the word PRINT, but… now what??

SIDE QUEST PROBLEM:

The “Print” trick doesn’t work for HBH (Half Baked Harvest)

Quick workaround?

import json

# Fetch the recipe page and parse its HTML.
url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')

# Find every text node in <body> that mentions the marker string.
# NOTE(review): presumably this is an inline `wprmpuc_recipe_<id> = {...};`
# script assignment -- confirm against the page source.
searched_word = 'wprmpuc_recipe_'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

# Keep everything after the FIRST '=' only: the JSON payload may itself contain
# '=' or ';' (for example inside URLs or HTML entities), so splitting on every
# occurrence would cut the value short and make the parse fail.
raw_value = results[0].split('=', 1)[1].lstrip()

# raw_decode parses one JSON value from the start of the string and reports
# where it ended, so whatever follows the value (such as a trailing ';') is
# ignored without having to search for a terminator character.
info_dict, json_end = json.JSONDecoder().raw_decode(raw_value)
clean_result = raw_value[:json_end]

# Notebook-style display of the parsed recipe data.
info_dict

GET INGREDIENTS

# Recipe pages (one per blog) to run the "Print"-link lookup against.
multi_blogs = [
    "https://www.gimmesomeoven.com/poblano-white-chicken-chili/",
    "https://www.skinnytaste.com/lentil-soup-with-butternut-and-kale/",
    "https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/",
    "https://www.twopeasandtheirpod.com/magic-cookie-bars/",
    "https://thedefineddish.com/miso-roasted-chicken/",
    "https://www.ambitiouskitchen.com/coconut-curried-brown-rice/",
    "https://whatsgabycooking.com/chicken-larb-bowls/",
    "https://paleomg.com/paleo-blueberry-chai-muffins/",
]

def get_print_link(url):
    """Return the href of the first "Print" link found on the page at *url*.

    The page is downloaded and parsed, every text node in ``<body>`` that
    contains the word "Print" is collected, and the href of the nearest
    enclosing ``<a href=...>`` of the first such node is returned.

    Args:
        url: Address of the recipe page to inspect.

    Returns:
        The ``href`` attribute value of the matching link.

    Raises:
        LookupError: If no "Print" text sits inside a link. This is the
            common base class of the ``IndexError``/``KeyError`` that an
            unguarded ``results[0].parent['href']`` would raise.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    searched_word = 'Print'
    # re.escape keeps the pattern literal even if the word is ever changed to
    # something containing regex metacharacters.
    pattern = re.compile('.*{0}.*'.format(re.escape(searched_word)))
    results = soup.body.find_all(string=pattern, recursive=True)
    print('Found the word "{0}" {1} times'.format(searched_word, len(results)))

    # The matching text is not always a direct child of the link (it may be
    # wrapped in a <span>, or appear first in plain body text), so walk up to
    # the closest anchor carrying an href and skip matches that have none.
    for text_node in results:
        anchor = text_node.find_parent('a', href=True)
        if anchor is not None:
            return anchor['href']

    raise LookupError(
        'No "{0}" link found on {1}'.format(searched_word, url)
    )
# Resolve the "Print" link of every blog page, in the same order as multi_blogs.
print_links = [get_print_link(blog) for blog in multi_blogs]

def _span_text_or_default(ingredient, css_class, default):
    """Return the text of the first ``<span class=css_class>`` in *ingredient*.

    When the ingredient has no such span, *default* is printed and returned.
    """
    span = ingredient.find("span", css_class)
    if span is None:
        print(default)
        return default
    return span.text


def get_ingredients_from_link(url):
    """Scrape the ingredient list from the recipe page at *url*.

    Every ``<li class="wprm-recipe-ingredient">`` on the page becomes one dict
    with the keys ``url``, ``recipe_name``, ``amount``, ``unit`` and ``name``.
    A missing amount/unit/name span is reported on stdout and replaced by the
    placeholder ``'no amount'`` / ``'no unit'`` / ``'no name'``.

    Args:
        url: Address of the (print-view) recipe page.

    Returns:
        A list of ingredient dicts; empty when the page has no matching
        ``<li>`` elements.
    """
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')

    # TODO: the real title lookup (first <h3 class="wprm-recipe-name">) is
    # disabled for now, so every row carries this placeholder name.
    recipe_name = 'test'

    all_ingredients = []
    for item in soup.find_all('li', "wprm-recipe-ingredient"):
        # An explicit "span not found" check replaces the former bare
        # `except:` clauses, which also hid unrelated errors and Ctrl-C.
        amount = _span_text_or_default(item, "wprm-recipe-ingredient-amount", 'no amount')
        unit = _span_text_or_default(item, "wprm-recipe-ingredient-unit", 'no unit')
        name = _span_text_or_default(item, "wprm-recipe-ingredient-name", 'no name')

        all_ingredients.append({
            'url': url,
            'recipe_name': recipe_name,
            'amount': amount,
            'unit': unit,
            'name': name,
        })

    print(all_ingredients)
    return all_ingredients

# Scrape each print link and remember the ones that yield no ingredients.
not_working = []
for link in print_links:
    # Visual separator between the output of consecutive pages.
    print('=============================================')
    print(link)
    ingredients = get_ingredients_from_link(link)
    if not ingredients:
        not_working.append(link)

GET SHOPPING LIST


# result = [format_ingredients(x) for x in sm_df['ingredients']]
# Running shopping list: ingredient name -> "amount(unit)" entries joined by ' + '.
shopping_list = {}


def add_ingredients_to_dictionary(formatted_ingredient):
    """Record one ingredient in the module-level ``shopping_list``.

    Args:
        formatted_ingredient: Mapping with the keys ``'name'``, ``'amount'``
            and ``'unit'``.

    Returns:
        None. The result is stored in ``shopping_list``: a new name gets the
        entry ``"amount(unit)"``; a name that is already present has
        ``" + amount(unit)"`` appended to its existing entry.
    """
    key = formatted_ingredient['name']
    quantity = "{}({})".format(
        formatted_ingredient['amount'],
        formatted_ingredient['unit'],
    )

    if key not in shopping_list:
        shopping_list[key] = quantity
        return

    shopping_list[key] = ' + '.join([shopping_list[key], quantity])
        
# Feed every ingredient dict into shopping_list. The helper works purely by
# side effect and returns None, so my_list is just a list of None values.
# NOTE(review): results_flattened is not defined in this snippet -- presumably
# the per-link ingredient lists flattened into one list; confirm.
my_list = list(map(add_ingredients_to_dictionary, results_flattened))