daily log 11.19.20

1 minute read

How to scrape data from your favorite recipe blogs so you can sort by ingredients you have and how much time you want to spend!!

To get the “ingredients” DataFrame (which is what we really want), we take the “original” DataFrame of scraped recipes and simply do this:

WORKS FOR BOTH (Half Baked Harvest and Minimalist Baker)!!

import requests
from bs4 import BeautifulSoup



# Sample recipe URL handed to get_recipe_ingredients below. The commented-out
# line is an alternative sample from the other blog; swap the two to try it.
# url = "https://www.halfbakedharvest.com/wprm_print/73488"
url = "https://minimalistbaker.com/wprm_print/69445"
def _first_text(node, tag, css_class, default):
    """Return the text of the first ``tag`` with ``css_class`` under ``node``, else ``default``."""
    match = node.find(tag, css_class)
    return default if match is None else match.text


def get_recipe_ingredients(url):
    """Scrape one recipe page and return its ingredients as a list of dicts.

    The page is fetched with ``requests`` and parsed with BeautifulSoup. Every
    ``li.wprm-recipe-ingredient`` element yields one dict with the keys
    ``url``, ``recipe_name``, ``recipe_img``, ``amount``, ``unit``, ``name``,
    ``rating`` and ``vote_count``. The recipe-level values (name, image,
    rating, vote count) are repeated on every row so the result can be loaded
    straight into a flat table.

    Args:
        url: Address of the recipe page to scrape.

    Returns:
        A list of dicts, one per ingredient; empty if the page lists no
        ingredients. A missing amount, unit or name is reported as
        ``'no amount'``, ``'no unit'`` or ``'no name'`` respectively.

    Raises:
        IndexError: If the page has no recipe name, image, rating or vote
            count element.
        KeyError: If the recipe image tag has no ``src`` attribute.
        requests.exceptions.RequestException: If the page cannot be fetched
            (including a timeout).
    """
    # Without a timeout requests waits forever on a stalled server.
    r = requests.get(url, timeout=30)
    soup = BeautifulSoup(r.content, 'html.parser')

    # Recipe-level fields are required: a page without them raises IndexError,
    # exactly as before, so callers can tell "not a recipe page" apart from
    # "recipe with an incomplete ingredient line".
    recipe_name = soup.find_all('h2', 'wprm-recipe-name')[0].text.strip()
    recipe_img = soup.find_all('div', 'wprm-recipe-image')[0].find_all('img')[0]['src']

    rating = soup.find_all("span", 'wprm-recipe-rating-average')[0].text.strip()
    vote_count = soup.find_all("span", 'wprm-recipe-rating-count')[0].text.strip()

    # Ingredient parts are optional; each falls back to a placeholder string.
    return [
        {
            'url': url,
            'recipe_name': recipe_name,
            'recipe_img': recipe_img,
            'amount': _first_text(item, "span", "wprm-recipe-ingredient-amount", 'no amount'),
            'unit': _first_text(item, "span", "wprm-recipe-ingredient-unit", 'no unit'),
            'name': _first_text(item, "span", "wprm-recipe-ingredient-name", 'no name'),
            'rating': rating,
            'vote_count': vote_count,
        }
        for item in soup.find_all("li", "wprm-recipe-ingredient")
    ]

# Scrape the sample recipe defined above; the returned list is not stored here.
get_recipe_ingredients(url)

FOR HBH (Half Baked Harvest)

import ast

import pandas as pd

# Flatten the per-recipe ingredient lists stored in the scraped-links CSV into
# one row per ingredient and save the result as its own CSV.
df = pd.read_csv('halfbakedharvest_links_with_ingredients.csv')

# Skip rows holding the 'no data' placeholder (presumably recipes whose scrape
# produced nothing; confirm against the code that wrote this CSV).
df = df[df['ingredients'] != 'no data']
test = df['ingredients']
# Each cell is the text of a Python list of ingredient dicts. It is parsed
# with ast.literal_eval instead of eval: eval would execute any code found in
# the CSV, while literal_eval accepts only plain literals.
ing_dict = [y for x in test for y in ast.literal_eval(x)]
ing_df = pd.DataFrame(ing_dict)
# The recipe id is the last path segment of the recipe URL.
ing_df['recipe_id'] = ing_df['url'].str.rsplit('/', n=1).str[-1]
ing_df.to_csv('halfbakedharvest_ingredients_only_v2.csv', index=False)

FOR MB (Minimalist Baker)

import ast

import pandas as pd

# Turn the stringified per-recipe records in the 'yoast_data' column into a
# DataFrame with one row per recipe.
df = pd.read_csv('minimalist_baker_with_yoast.csv')

# Skip rows holding the 'no data' placeholder (presumably recipes for which
# nothing was scraped; confirm against the code that wrote this CSV).
df = df[df['yoast_data'] != 'no data']
test = df['yoast_data']
# Parsed with ast.literal_eval instead of eval: eval would execute any code
# found in the CSV, while literal_eval accepts only plain literals.
# NOTE(review): assumes every cell is the repr of a literal (e.g. a dict);
# confirm against the scraper that produced the file.
ing_dict = [ast.literal_eval(x) for x in test]
ing_df = pd.DataFrame(ing_dict)

Share on

Twitter Facebook LinkedIn

Daniel Caraway

daily log 11.19.20

How to scrape data from your favorite recipe blogs so you can sort by ingredients you have and how much time you want to spend!!

WORKS FOR BOTH!!

FOR HBH

FOR MB

Share on

You may also enjoy

daily log 03-25-21

How to use Data Science Superpowers for Useless Things: Getting a Job at Amazon, Take 2

How to use Data Science Superpowers for Useless Things: Getting a Job at Amazon

How to use Data Science Superpowers for Useless Things: Adding Text to Images (aka Cats Narrate the Big Lebowski)