daily log 12.16.20
THINGS I LEARNED TODAY
- If getting errors with eval, use json.loads instead
import json
import re
import requests
from bs4 import BeautifulSoup

url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
# the recipe data lives in a script that assigns it to a wprmpuc_recipe_* variable
searched_word = 'wprmpuc_recipe_'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))
# keep what sits between the '=' and the trailing ';', i.e. the JSON literal
clean_result = results[0].split('=')[1].split(';')[0].strip()
info_dict = json.loads(clean_result)
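For context on that eval note: the embedded JSON uses literals like true/false/null, which eval trips over but json.loads parses fine. A tiny contrast on a made-up string (not the real page data):
raw = '{"name": "test recipe", "vegetarian": true, "video": null}'
# eval(raw) -> NameError: name 'true' is not defined
info = json.loads(raw)   # true -> True, null -> None
print(info['vegetarian'], info['video'])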
ANNA EXTENSION
IMPETUS: We want to be able to aggregate recipe data
PROBLEM: All recipe blogs are slightly different, format-wise
SOLUTION: Find something all recipe pages in all recipe blogs have in common – THE WORD PRINT!!
PROBLEM: How… how will we find this word on each page in a way that works across many different blogs?
SOLUTION: Beautiful soup and regex!!
url = "https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
searched_word = 'Print'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))
# the matched "Print" text node's parent is the <a> tag, so its href is the print-page url
results[0].parent['href']
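One caveat I'm adding here: .parent only gives the <a> tag if the "Print" text sits directly inside it. If a blog nests it in a <span>, find_parent('a') walks up to the nearest enclosing anchor instead. A hedged variant:
link_tag = results[0].find_parent('a')   # nearest enclosing <a>, however deeply the text is nested
print_url = link_tag['href'] if link_tag else None
print(print_url)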
ALL TOGETHER NOW!!
# url = "https://www.gimmesomeoven.com/poblano-white-chicken-chili/"
# url = "https://www.skinnytaste.com/lentil-soup-with-butternut-and-kale/"
# url = "https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/"
# NOPE url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
# url = "https://www.twopeasandtheirpod.com/magic-cookie-bars/"
# url = "https://thedefineddish.com/miso-roasted-chicken/"
# url = "https://www.ambitiouskitchen.com/coconut-curried-brown-rice/"
# url = "https://whatsgabycooking.com/chicken-larb-bowls/"
# url = "https://paleomg.com/paleo-blueberry-chai-muffins/"
# NOPE url = "https://pinchofyum.com/30-minute-vegetarian-meatballs"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
searched_word = 'Print'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))
results[0].parent['href']
PROBLEM: Ok great! We found the word PRINT, but… now what??
SIDE QUEST PROBLEM: The Print-link trick doesn't work for HBH (Half Baked Harvest).
Quick workaround?
import json
url = "https://www.halfbakedharvest.com/one-pan-four-cheese-sun-dried-tomato-and-spinach-drunken-pasta-bake/"
r = requests.get(url)
soup = BeautifulSoup(r.content, 'html.parser')
searched_word = 'wprmpuc_recipe_'
results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
print('Found the word "{0}" {1} times'.format(searched_word, len(results)))
clean_result = results[0].split('=')[1].split(';')[0].strip()
info_dict = json.loads(clean_result)
info_dict
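Before relying on any particular key, it's worth peeking at what the embedded JSON actually contains (my quick-inspection sketch – I haven't pinned down the wprm schema):
print(type(info_dict))
if isinstance(info_dict, dict):
    print(list(info_dict.keys()))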
GET INGREDIENTS
multi_blogs = ["https://www.gimmesomeoven.com/poblano-white-chicken-chili/",
"https://www.skinnytaste.com/lentil-soup-with-butternut-and-kale/",
"https://minimalistbaker.com/orange-cranberry-crisp-gluten-free-easy/",
"https://www.twopeasandtheirpod.com/magic-cookie-bars/",
"https://thedefineddish.com/miso-roasted-chicken/",
"https://www.ambitiouskitchen.com/coconut-curried-brown-rice/",
"https://whatsgabycooking.com/chicken-larb-bowls/",
"https://paleomg.com/paleo-blueberry-chai-muffins/"]
def get_print_link(url):
    # find the "Print" link on a recipe page and return its href
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    searched_word = 'Print'
    results = soup.body.find_all(string=re.compile('.*{0}.*'.format(searched_word)), recursive=True)
    print('Found the word "{0}" {1} times'.format(searched_word, len(results)))
    return results[0].parent['href']

print_links = []
for blog in multi_blogs:
    print_links.append(get_print_link(blog))
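Some blogs have no usable Print link (the NOPE urls above), so here is a hedged variant of this loop that skips failures instead of crashing if a bad blog ever lands in the list – my addition, not part of the original run:
print_links = []
skipped = []
for blog in multi_blogs:
    try:
        print_links.append(get_print_link(blog))
    except (IndexError, KeyError):   # no "Print" match at all, or no href on the parent tag
        skipped.append(blog)
print('skipped:', skipped)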
def get_ingredients_from_link(url):
    # pull the wprm-* ingredient spans out of a print page
    r = requests.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    # recipe_name = soup.find_all('h3', 'wprm-recipe-name')[0].text.strip()
    recipe_name = 'test'
    ingredients = soup.find_all('li', "wprm-recipe-ingredient")
    all_ingredients = []
    for i in ingredients:
        # each piece can be missing, so fall back to a placeholder instead of crashing
        try:
            amount = i.find_all("span", "wprm-recipe-ingredient-amount")
            amount = amount[0].text
        except IndexError:
            amount = 'no amount'
            print('no amount')
        try:
            unit = i.find_all("span", "wprm-recipe-ingredient-unit")
            unit = unit[0].text
        except IndexError:
            unit = 'no unit'
            print('no unit')
        try:
            name = i.find_all("span", "wprm-recipe-ingredient-name")
            name = name[0].text
        except IndexError:
            name = 'no name'
            print('no name')
        all_ingredients.append({'url': url,
                                'recipe_name': recipe_name,
                                'amount': amount,
                                'unit': unit,
                                'name': name})
    print(all_ingredients)
    return all_ingredients
not_working = []
for link in print_links:
    print('=============================================')
    print(link)
    ingredients = get_ingredients_from_link(link)
    if len(ingredients) == 0:
        not_working.append(link)
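The shopping-list cell below reads from results_flattened, which this loop never actually builds; one way it could be assembled (my sketch – the variable name comes from the last cell, the construction is an assumption):
results_flattened = []
for link in print_links:
    # each call returns a list of ingredient dicts; extend() keeps the result flat
    results_flattened.extend(get_ingredients_from_link(link))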
GET SHOPPING LIST
# result = [format_ingredients(x) for x in sm_df['ingredients']]
shopping_list = {}

def add_ingredients_to_dictionary(formatted_ingredient):
    # fold one ingredient dict into the running shopping list, concatenating amounts
    # print(formatted_ingredient)
    ingredient = formatted_ingredient['name']
    amount = formatted_ingredient['amount']
    unit = formatted_ingredient['unit']
    amount_unit = "{}({})".format(amount, unit)
    if ingredient in shopping_list:
        shopping_list[ingredient] = shopping_list[ingredient] + ' + ' + amount_unit
    else:
        shopping_list[ingredient] = amount_unit

my_list = [add_ingredients_to_dictionary(x) for x in results_flattened]
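And a quick look at the result (usage sketch):
for name, amounts in shopping_list.items():
    print('{}: {}'.format(name, amounts))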