Skip to content

Instantly share code, notes, and snippets.

Last active May 13, 2018 13:26
Show Gist options
  • Save mkyt/e3934169bca276cea97a421f9585c802 to your computer and use it in GitHub Desktop.
Save mkyt/e3934169bca276cea97a421f9585c802 to your computer and use it in GitHub Desktop.
Extract recipes for cook4me express
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from urllib.request import urlopen
import re
import json
from pprint import pprint
from bs4 import BeautifulSoup
# URL templates for the recipe site. The '{:d}' placeholder is filled with a
# list page number (LIST_URL) or a recipe id (DETAIL_URL).
# NOTE(review): BASE_URL is not defined anywhere in the visible source, so
# these lines raise NameError as written. It must be assigned before this
# point; presumably the site root without a trailing slash, since the paths
# below start with '/' -- TODO confirm the intended value.
LIST_URL = BASE_URL + '/recipe/category/c4m-express/{:d}/'
DETAIL_URL = BASE_URL + '/recipe/detail/{:d}/'
# Arguments unpacked into range() by obtain_all_ids(): list pages 1 through 10.
LIST_PAGES = (1, 11)
def get(url):
    """Fetch ``url`` and return the page parsed with the lxml parser.

    The response body is decoded as UTF-8 before parsing.

    Args:
        url: absolute URL of the page to download.

    Returns:
        A BeautifulSoup document for the downloaded page.
    """
    # Close the HTTP response deterministically instead of leaking the
    # connection until garbage collection.
    with urlopen(url) as response:
        html = response.read().decode('utf-8')
    return BeautifulSoup(html, 'lxml')
def extract_detail(soup, id_):
    """Parse one recipe detail page into a plain dict.

    Args:
        soup: parsed detail page (BeautifulSoup document). It is modified in
            place: a ``<dd>`` is inserted after every ``dt.only_dt`` element.
        id_: recipe id, stored unchanged in the result.

    Returns:
        dict with the keys
            id: int
            title: string
            cook_duration: int (in minutes)
            prep_duration: int (in minutes), or None when not stated
            img_url: string
            comment: string
            calorie: int (in kcal)
            genre: string
            difficulty: string
            instructions: list of string
            yield: int, or None when not stated
            ingredients: list of dict with 'name' and 'amount', plus
                'marking' and 'detail' when the name carries them

    Raises:
        ValueError: if the recipe container element is not found.
    """
    print('parsing detail id={}'.format(id_))
    # NOTE(review): the itemtype value is empty in this copy of the source;
    # presumably it originally held the recipe vocabulary URL -- confirm.
    r = soup.select_one('div[itemtype=""]')
    if r is None:
        raise ValueError('recipe container not found for id={}'.format(id_))
    title = r.select_one('h2[itemprop="name"]').text
    # assuming one image per recipe
    img_url = BASE_URL + r.select_one('div#recipe_photo img').attrs['src']

    # comment & prep_duration
    s = r.select_one('p[itemprop="summary"]').text
    # s:= '根菜をたっぷりとれるヘルシーおかず。\n\n【準備時間:15分】'
    # search (not match): the bracketed part follows the free-text comment.
    m = re.search(r'【準備時間:(\d+)分】', s)
    prep_duration = int(m.group(1)) if m is not None else None
    comment = s.split('【')[0].strip()

    cook_duration = int(r.select_one('div#recipe_content ul li.r_time time').text.strip()[:-1])
    calorie = int(r.select_one('div#recipe_content ul').text.split(':')[-1][:-4])
    genre = r.select_one('div#recipe_content ul li.genre').text.split(':')[-1].strip()
    difficulty = r.select_one('div#recipe_content ul li.level').text.split(':')[-1].strip()
    instructions = [elem.text.strip() for elem in r.select('div#recipe_howto ul li')]

    m = re.match(r'\((\d+)人分', r.select_one('div[itemprop="ingredient"] span[itemprop="yield"]').text)
    yield_ = int(m.group(1)) if m is not None else None

    ing_elem = r.select_one('div[itemprop="ingredient"] dl')
    # special treatment for only_dt (where amount is omitted): give each such
    # <dt> its own <dd> so the dt/dd pairing below stays aligned.
    for elem in ing_elem.select('dt.only_dt'):
        if ')' in elem.text:  # fix for ID: 1525
            # The text after the closing paren is the amount.
            name, _, amt = elem.text.partition(')')
            elem.string = name + ')'
        else:
            amt = ''
        dd = soup.new_tag('dd')
        dd.string = amt
        elem.insert_after(dd)

    ingredients = [
        {'name': name.text.strip(), 'amount': amt.text.strip()}
        for name, amt in zip(ing_elem.select('dt'), ing_elem.select('dd'))
    ]
    for ingredient in ingredients:
        name = ingredient['name']
        if '-' in name:
            # "<name>-<marking>": split on the first dash only.
            name, _, marking = name.partition('-')
            name = name.strip()
            ingredient['name'] = name
            ingredient['marking'] = marking.strip()
        if name.count('(') == 1:
            name, detail = name.split('(')
            detail = detail.strip()
            if detail.endswith(')'):
                detail = detail[:-1]  # remove closing paren
            else:
                print('warning: unbalanced paren in ingredient {!r} (id={})'.format(
                    ingredient['name'], id_))
            ingredient['name'] = name.strip()
            ingredient['detail'] = detail.strip()
        elif '(' in name:
            # More than one parenthesised part: leave the name untouched.
            print('warning: multiple parens in ingredient {!r} (id={})'.format(name, id_))

    return {
        'id': id_,
        'title': title,
        'cook_duration': cook_duration,
        'prep_duration': prep_duration,
        'img_url': img_url,
        'comment': comment,
        'calorie': calorie,
        'genre': genre,
        'difficulty': difficulty,
        'instructions': instructions,
        'yield': yield_,
        'ingredients': ingredients,
    }
def extract_ids(soup):
    """Return the recipe ids linked from one list page, in document order.

    Args:
        soup: parsed list page; must provide ``select(css_selector)``.

    Returns:
        list of int, one per recipe link whose href contains
        ``/recipe/detail/<digits>/``. Links without such an href are skipped.
    """
    res = []
    for elem in soup.select('div.recipe_item p.text a'):
        # elem := "<a href="/recipe/detail/1469/">野菜の肉巻き</a>"
        m = re.search(r'/recipe/detail/(\d+)/', elem.attrs.get('href', ''))
        if m is not None:
            res.append(int(m.group(1)))
    return res
def obtain_detail(id_):
    """Download the detail page of recipe ``id_`` and return its parsed dict."""
    return extract_detail(get(DETAIL_URL.format(id_)), id_)
def obtain_all_ids():
    """Gather the recipe ids from every list page, sorted ascending.

    The pages visited are ``range(*LIST_PAGES)``.
    """
    return sorted(
        recipe_id
        for page_no in range(*LIST_PAGES)
        for recipe_id in extract_ids(get(LIST_URL.format(page_no)))
    )
if __name__ == '__main__':
    # Scrape every listed recipe and write them out as one JSON array.
    recipes = []
    ids = obtain_all_ids()
    for id_ in ids:
        detail = obtain_detail(id_)
        # Keep the parsed recipe; without this the output is always "[]".
        recipes.append(detail)
    # ensure_ascii=False emits raw non-ASCII text, so fix the file encoding
    # explicitly and close the file deterministically.
    with open('recipes.json', 'w', encoding='utf-8') as f:
        json.dump(recipes, f, ensure_ascii=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment