Skip to content

Instantly share code, notes, and snippets.

@ijharulislam
Created October 4, 2017 09:52
Show Gist options
  • Save ijharulislam/742805dd8e836e2c11d348d0f0b02dbf to your computer and use it in GitHub Desktop.
Save ijharulislam/742805dd8e836e2c11d348d0f0b02dbf to your computer and use it in GitHub Desktop.
import urllib.parse
import urllib.request
import json
# Fixture for the Takashimaya store pages.
#   'format'       - one rule per output field (item_name) with the XPath to
#                    read and a regex_match / regex_item pair.
#   'target'       - the pages to process, each with an 'id' and a 'url'.
#   'coordination' - XPath / regex rule with "$1"/"$2" placeholders for lat/lon.
# The literal string 'Null' is used where no regex is given; presumably the
# consumer treats it as "not set" -- TODO confirm against the consumer.
# Regex patterns are raw strings so backslash sequences such as \d and \s are
# not parsed as (invalid) Python string escapes; the runtime values are the
# same as with the previous non-raw literals.
# Later assignments in this file rebind `input_data`.
input_data = {
    'format': [
        {
            'item_name': 'name',
            'xpath': '//*[@id="topicPath"]/li[2]/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '',
        },
        {
            'item_name': 'zip',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒([\d\-]+)\s*',
            'regex_item': '$1 ',
            'remove_tag': '',
        },
        {
            'item_name': 'address',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*(.+)\s*TEL',
            'regex_item': '$1 ',
            'remove_tag': '',
        },
        {
            'item_name': 'tel',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*.+\s*TEL\s*\:*\s*([\d\-\(\)\s]+)',
            'regex_item': '$1',
            'remove_tag': '',
        },
    ],
    'target': [
        {
            'id': '1',
            'url': 'https://www.takashimaya.co.jp/rakusai/index.html',
        },
        {
            'id': '2',
            'url': 'https://www.takashimaya.co.jp/okayama/index.html',
        },
        {
            'id': '3',
            'url': 'https://www.takashimaya.co.jp/tachikawa/index.html',
        },
        {
            'id': '4',
            'url': 'https://www.takashimaya.co.jp/yokohama/index.html',
        },
    ],
    'coordination': [
        {
            "item_name": "coordination",
            "mapurl_xpath": '//*[@id="storeInfo"]/div/p/a',
            "xpath": '//*[@id="mapDiv"]/div/div/div[10]/div/div/div/div[7]/div/a',
            "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\d\.]+),([\d\.]+)",
            "lat": "$1",
            "lon": "$2",
            "mapurl_regex": r"\/[^\/]+\/access\/index.html",
        },
    ],
}
# input_data = {"coordination": [], "target": [{"id": "1", "url": "http://locations.arbys.com/us/wi/stoughton/900-nygaard-street.html"}, {"id": "2", "url": "http://locations.arbys.com/us/nm/raton/415-clayton-hwy.html"}, {"id": "3", "url": "http://locations.arbys.com/us/ks/mission/6780-johnson-dr.html"}, {"id": "4", "url": "http://locations.arbys.com/us/al/eufaula/815-south-eufaula-ave.html"}], "format": [{"remove_tag": "", "xpath": "//*[@id=\"location-name\"]", "regex_match": ">([^<]+)<", "regex_item": "$1", "item_name": "name"}, {"remove_tag": "", "xpath": "//*[@id=\"address\"]/span[3]", "regex_match": "\\s*(\\d+)", "regex_item": "$1 ", "item_name": "zip"}, {"remove_tag": "", "xpath": "//*[@id=\"address\"]", "regex_match": "class=\"c-address-street-1\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressLocality\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"\\s*>\\s*([^<]+)", "regex_item": "$1 $2, $3", "item_name": "address"}, {"remove_tag": "", "xpath": "//*[@id=\"logistics\"]/div/div/div[2]/div[2]/div[3]/div/a", "regex_match": ">([\\(\\w\\)\\s\\-]+)<", "regex_item": "$1", "item_name": "tel"}]}
# input_data = {"target": [{"id": "1", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=89"}, {"id": "2", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=124"}, {"id": "3", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=38"}, {"id": "4", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=12"}, {"id": "5", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=91"}], "coordination": [{"regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "srid": "", "xpath": "//*[@id=\"gmap\"]/div/div/div[2]/a", "mapurl_regex": "", "lat": "$2", "lon": "$1", "tinymapurl_regex": "", "mapurl_xpath": ""}], "format": [{"regex_match": "Null", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/header[3]/div/h2/text()", "item_name": "name"}, {"regex_match": "\u3012([\\d\\-]+)", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "item_name": "zip"}, {"regex_match": "\u3012[\\d\\-]+<br>([^<]+)", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "item_name": "address"}, {"regex_match": "Null", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[2]/text()", "item_name": "tel"}, {"regex_match": "<dd[^>]+>([^<]+)(?:<br>)*([^<]+)</dd>", "regex_item": "$1 $2", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[3]", "item_name": "hour"}]}
# Fixture for the Saizeriya shop-detail pages ("coordination" rule, "format"
# rules per field, "target" pages).  The next assignment in this file rebinds
# `input_data`, so this value is superseded immediately.
# Fix: the coordination XPath previously read //*[@id=yolp-logo-link"] (missing
# the opening quote of the attribute value), which is not a valid XPath.
# NOTE(review): "\u00a5" is the yen sign; it sits where a backslash would be
# expected in the regex (e.g. "\u00a5d" vs "\\d") -- confirm whether the
# consumer normalises it before changing anything.
input_data = {
    "coordination": [
        {
            "lat": "$2",
            "mapurl_regex": "",
            "tinymapurl_regex": "",
            "mapurl_xpath": "",
            "regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\&lat=([\\-\u00a5d\u00a5.]+)&.*\\&lon=([\\-\u00a5d\u00a5.]+)",
            "xpath": "//*[@id=\"yolp-logo-link\"]",
            "srid": "",
            "lon": "$1",
        },
    ],
    "format": [
        {
            "xpath": "//*[@id=\"shop_detail\"]/h2/text()",
            "item_name": "name",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "Null",
            "item_name": "zip",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()",
            "item_name": "address",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()",
            "item_name": "tel",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
    ],
    "target": [
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"},
    ],
}
# Fixture for the Saizeriya shop-detail pages: "target" pages, one
# "coordination" rule, and one "format" rule per output field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"},
    ],
    "coordination": [
        {
            "regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\&lat=([\\-\u00a5d\u00a5.]+)&.*\\&lon=([\\-\u00a5d\u00a5.]+)",
            "mapurl_xpath": "",
            "xpath": '//*[@id="yolp-logo-link"]',
            "lon": "$1",
            "srid": "",
            "lat": "$2",
            "tinymapurl_regex": "",
            "mapurl_regex": "",
        },
    ],
    "format": [
        {
            "regex_item": "Null",
            "item_name": "name",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/h2/text()",
        },
        {
            "regex_item": "Null",
            "item_name": "zip",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "Null",
        },
        {
            "regex_item": "Null",
            "item_name": "address",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()",
        },
        {
            "regex_item": "Null",
            "item_name": "tel",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()",
        },
    ],
}
# Fixture for the uny.co.jp shop pages: "format" rules per field, "target"
# pages, and one "coordination" rule that also carries map-page settings
# (mapurl_xpath / mapurl_regex / tinymapurl_regex).
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in regex_match -- confirm how the consumer interprets it.
input_data = {
    "format": [
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleH\"]/h2/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "name",
        },
        {
            "regex_item": "Null",
            "xpath": "Null",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "zip",
        },
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[4]/td/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "address",
        },
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[2]/td/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "tel",
        },
    ],
    "target": [
        {"url": "http://www.uny.co.jp/shop/207/index.html", "id": "1"},
        {"url": "http://www.uny.co.jp/shop/105/index.html", "id": "2"},
        {"url": "http://www.uny.co.jp/shop/135/index.html", "id": "3"},
        {"url": "http://www.uny.co.jp/shop/150/index.html", "id": "4"},
    ],
    "coordination": [
        {
            "mapurl_regex": "\\/shop\\/[^\\/]+\\/access\\.html",
            "xpath": "//*[@id=\"map\"]/div/div/div[2]/a",
            "lat": "$2",
            "lon": "$1",
            "tinymapurl_regex": "javascript:void\\(0\\);",
            "mapurl_xpath": "//*[@id=\"accessMap\"]/a",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "srid": "",
        },
    ],
}
# Fixture for the Burger King Japan store-detail pages: five "target" pages,
# five "format" rules (name, zip, address, tel, hour) and one "coordination"
# rule.  A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"id": "1", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=89"},
        {"id": "2", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=124"},
        {"id": "3", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=38"},
        {"id": "4", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=12"},
        {"id": "5", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=91"},
    ],
    "format": [
        {
            "xpath": "//*[@id=\"storeMap\"]/header[3]/div/h2/text()",
            "regex_item": "Null",
            "item_name": "name",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]",
            "regex_item": "$1 ",
            "item_name": "zip",
            "remove_tag": "",
            "regex_match": "\u3012([\\d\\-]+)",
        },
        {
            "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]",
            "regex_item": "$1 ",
            "item_name": "address",
            "remove_tag": "",
            "regex_match": "\u3012[\\d\\-]+<br>([^<]+)",
        },
        {
            "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[2]/text()",
            "regex_item": "Null",
            "item_name": "tel",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[3]",
            "regex_item": "$1 $2",
            "item_name": "hour",
            "remove_tag": "",
            "regex_match": "<dd[^>]+>([^<]+)(?:<br>)*([^<]+)</dd>",
        },
    ],
    "coordination": [
        {
            "tinymapurl_regex": "",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lon": "$1",
            "mapurl_regex": "",
            "lat": "$2",
            "xpath": "//*[@id=\"gmap\"]/div/div/div[2]/a",
            "mapurl_xpath": "",
            "srid": "",
        },
    ],
}
# Fixture for the Yoshinoya shop-map detail pages: "target" pages, "format"
# rules per field (absolute /html/body/... XPaths) and one "coordination" rule.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"url": "https://map.yoshinoya.com/p/shopmap/dtl/1320/?&his=al1,nm", "id": "1"},
        {"url": "https://map.yoshinoya.com/p/shopmap/dtl/1348/?&his=al1,nm", "id": "2"},
        {"url": "https://map.yoshinoya.com/p/shopmap/dtl/979/?&his=al1,nm", "id": "3"},
        {"url": "https://map.yoshinoya.com/p/shopmap/dtl/1371/?&his=al1,nm", "id": "4"},
    ],
    "format": [
        {
            "regex_item": "$1 ",
            "xpath": "/html/body/div/div[2]/div/div/div[1]/h1",
            "remove_tag": "",
            "item_name": "name",
            "regex_match": "<img[^>]+>\\s*(.+)\\s*",
        },
        {
            "regex_item": "Null",
            "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[1]/dd/ul/li[1]/text()",
            "remove_tag": "",
            "item_name": "zip",
            "regex_match": "Null",
        },
        {
            "regex_item": "Null",
            "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[1]/dd/ul/li[2]/text()",
            "remove_tag": "",
            "item_name": "address",
            "regex_match": "Null",
        },
        {
            "regex_item": "Null",
            "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[2]/dd/text()",
            "remove_tag": "",
            "item_name": "tel",
            "regex_match": "Null",
        },
    ],
    "coordination": [
        {
            "mapurl_regex": "",
            "xpath": "//*[@id=\"ZdcEmapMap\"]/div/div/div[2]/a",
            "srid": "",
            "tinymapurl_regex": "",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lat": "$2",
            "lon": "$1",
            "mapurl_xpath": "",
        },
    ],
}
# Fixture for the Saint Marc cafe info pages: one "coordination" rule (with
# srid "4301"), "target" pages and "format" rules per field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "coordination": [
        {
            "lat": "$2",
            "srid": "4301",
            "lon": "$1",
            "mapurl_xpath": "",
            "xpath": "//*[@id=\"MapiMapLink\"]/a",
            "mapurl_regex": "",
            "tinymapurl_regex": "",
            "regex_match": "\\/m\u00a5/saintmarc\u00a5/([\\-\u00a5d\u00a5.]+)_([\\-\u00a5d\u00a5.]+)_[\u00a5d\u00a5.]+\u00a5/\u00a5?brand_type=CFE",
        },
    ],
    "target": [
        {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20265/?brand_type=CFE", "id": "1"},
        {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20347/?brand_type=CFE", "id": "2"},
        {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20364/?brand_type=CFE", "id": "3"},
        {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20393/?brand_type=CFE", "id": "4"},
    ],
    "format": [
        {
            "xpath": "//*[@id=\"MapiContainer\"]/div/h1",
            "remove_tag": "",
            "item_name": "name",
            "regex_item": "$1 ",
            "regex_match": "<br>\"*\\s*(.*)\\s*\"*<",
        },
        {
            "xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[1]/td",
            "remove_tag": "",
            "item_name": "zip",
            "regex_item": "$1 ",
            "regex_match": "\u3012\\s*([\\d-]+)\\s*<",
        },
        {
            "xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[1]/td",
            "remove_tag": "",
            "item_name": "address",
            "regex_item": "$1 ",
            "regex_match": "<br>(.+)$",
        },
        {
            "xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[2]/td/ul/li/text()",
            "remove_tag": "",
            "item_name": "tel",
            "regex_item": "$1 ",
            "regex_match": "TEL\uff1a([\\d\\-]+)",
        },
    ],
}
#113_DoutorCoffee
# Fixture for the Doutor shop info pages on sasp.mapion.co.jp: "target" pages,
# "format" rules per field and one "coordination" rule (srid "4301").
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"url": "http://sasp.mapion.co.jp/b/doutor/info/01010312/", "id": "1"},
        {"url": "http://sasp.mapion.co.jp/b/doutor/info/01011384/", "id": "2"},
        {"url": "http://sasp.mapion.co.jp/b/doutor/info/02011541/", "id": "3"},
        {"url": "http://sasp.mapion.co.jp/b/doutor/info/01010074/", "id": "4"},
    ],
    "format": [
        {
            "xpath": "//*[@id=\"MapiContainer\"]/div/h1/text()",
            "item_name": "name",
            "regex_item": "$1 ",
            "regex_match": "\\s*(.+)\\s*$",
            "remove_tag": "",
        },
        {
            "xpath": "Null",
            "item_name": "zip",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "xpath": "//*[@id=\"MapiDataArea\"]/div/table/tbody/tr[1]/td/text()",
            "item_name": "address",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "xpath": "//*[@id=\"MapiDataArea\"]/div/table/tbody/tr[2]/td/text()",
            "item_name": "tel",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
    ],
    "coordination": [
        {
            "xpath": "//*[@id=\"MapiMapLink\"]/a",
            "mapurl_regex": "",
            "srid": "4301",
            "lon": "$1",
            "lat": "$2",
            "regex_match": "\\/m\u00a5/doutor\u00a5/([\\-\u00a5d\u00a5.]+)_([\\-\u00a5d\u00a5.]+)_[\u00a5d\u00a5.]+\u00a5/",
            "mapurl_xpath": "",
            "tinymapurl_regex": "",
        },
    ],
}
#114_SKYLARK
# Fixture for the Skylark store-info detail pages.  The target URLs carry long
# percent-encoded "bc" query values that must be kept exactly as written.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {
            "id": "1",
            "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ62bn26a%2FXzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjGygAjGA4VyIEIlRaWpIFPnbHg6e%2FPjxnVPdqx9ur%2F5cWP346ZOiP1A2eKC%2FBKwPfq5iQUodiXn5xelYLXJyNAIu2WxAJOvWf4%3D&code=012771&_resl=true",
        },
        {
            "id": "2",
            "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ4v63%2Bya%2BrzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjG0gAjGA4VyIEIlRaWpQJOeztnwdPbmx43rnuxY%2B3R%2F8%2BPG7sdNnRD7gbLFBfklYHv0cxMLUOxKzs8vSsFqk5GBJXbLYgGAhFno&code=011475&_resl=true",
        },
        {
            "id": "3",
            "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ7uan66edvzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjE2gQjGA4VyIEIlRaWpIFPnbHg6e%2FPjxnVPdqx9ur%2F5cWP346ZOiP1A2eKC%2FBKwPfq5iQUodiXn5xelYLXJ2ACHZbEAl2RaAQ%3D%3D&code=012840&_resl=true",
        },
        {
            "id": "4",
            "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ7u2%2Fx087bnc3qA7MSi1ET9nMziEvvi1MSi5IyYUgMD45TElJSi1OJiEMfIDIljnGJsBhGMBwrlQIRKikpTQabO2fB09ubHjeue7Fj7dH%2Fz48bux02dEPuBssUF%2BSVge%2FRzEwtQ7ErOzy9KwWqTkYEhdstiAb9GWjU%3D&code=012861&_resl=true",
        },
    ],
    "format": [
        {
            "regex_match": "Null",
            "regex_item": "Null",
            "xpath": "//*[@id=\"w_1_detail_1_1-spot-name\"]/text()",
            "remove_tag": "",
            "item_name": "name",
        },
        {
            "regex_match": "Null",
            "regex_item": "Null",
            "xpath": "//*[@id=\"w_1_detail_1_1_2-tbody\"]/tr[1]/td[2]/span[1]/text()",
            "remove_tag": "",
            "item_name": "zip",
        },
        {
            "regex_match": "Null",
            "regex_item": "Null",
            "xpath": "//*[@id=\"w_1_detail_1_1_2-beforeAddressSpace\"]/text()",
            "remove_tag": "",
            "item_name": "address",
        },
        {
            "regex_match": "Null",
            "regex_item": "Null",
            "xpath": "//*[@id=\"w_1_detail_1_1_2-tbody\"]/tr[2]/td[2]/span/text()",
            "remove_tag": "",
            "item_name": "tel",
        },
    ],
    "coordination": [
        {
            "regex_match": "http:\\/\\/maps\\.google\\.com\\/maps\\?q=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "tinymapurl_regex": "",
            "srid": "",
            "mapurl_regex": "",
            "lat": "$2",
            "mapurl_xpath": "",
            "xpath": "//*[@id=\"w_7_imagelink_1_1_7-label-upper\"]/span/a",
            "lon": "$1",
        },
    ],
}
#115_Saizeriya
# Fixture for the Saizeriya shop-detail pages: "format" rules per field,
# "target" pages and one "coordination" rule.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "format": [
        {
            "regex_match": "Null",
            "item_name": "name",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/h2/text()",
            "regex_item": "Null",
        },
        {
            "regex_match": "Null",
            "item_name": "zip",
            "remove_tag": "",
            "xpath": "Null",
            "regex_item": "Null",
        },
        {
            "regex_match": "Null",
            "item_name": "address",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()",
            "regex_item": "Null",
        },
        {
            "regex_match": "Null",
            "item_name": "tel",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()",
            "regex_item": "Null",
        },
    ],
    "target": [
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"},
    ],
    "coordination": [
        {
            "regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\;lat=([\\-\u00a5d\u00a5.]+)&.*\\;lon=([\\-\u00a5d\u00a5.]+)",
            "tinymapurl_regex": "",
            "srid": "",
            "lon": "$1",
            "mapurl_xpath": "",
            "lat": "$2",
            "mapurl_regex": "",
            "xpath": "//*[@id=\"yolp-logo-link\"]",
        },
    ],
}
#116_APITA
# Fixture for the APITA (uny.co.jp) shop pages: "target" pages, one
# "coordination" rule with map-page settings, and "format" rules per field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in regex_match -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"id": "1", "url": "http://www.uny.co.jp/shop/207/index.html"},
        {"id": "2", "url": "http://www.uny.co.jp/shop/105/index.html"},
        {"id": "3", "url": "http://www.uny.co.jp/shop/135/index.html"},
        {"id": "4", "url": "http://www.uny.co.jp/shop/150/index.html"},
    ],
    "coordination": [
        {
            "xpath": "//*[@id=\"map\"]/div/div/div[2]/a",
            "mapurl_xpath": "//*[@id=\"accessMap\"]/a",
            "lat": "$2",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "tinymapurl_regex": "javascript:void\\(0\\);",
            "lon": "$1",
            "mapurl_regex": "\\/shop\\/[^\\/]+\\/access\\.html",
            "srid": "",
        },
    ],
    "format": [
        {
            "item_name": "name",
            "xpath": "//*[@id=\"titleH\"]/h2/text()",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "item_name": "zip",
            "xpath": "Null",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "item_name": "address",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[4]/td/text()",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "item_name": "tel",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[2]/td/text()",
            "regex_item": "Null",
            "regex_match": "Null",
            "remove_tag": "",
        },
    ],
}
#117_Ito-Yokado
# Fixture for the Ito-Yokado shop pages: "target" pages, one "coordination"
# rule (with a goo.gl tinymapurl_regex) and "format" rules per field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"url": "http://blog.itoyokado.co.jp/shop/765/index.html", "id": "1"},
        {"url": "http://blog.itoyokado.co.jp/shop/225/index.html", "id": "2"},
        {"url": "http://blog.itoyokado.co.jp/shop/196/index.html", "id": "3"},
        {"url": "http://blog.itoyokado.co.jp/shop/078/index.html", "id": "4"},
    ],
    "coordination": [
        {
            "mapurl_xpath": "",
            "xpath": "//*[@id=\"shop_content_column\"]/div[5]/div/div[1]/p[1]/a",
            "lon": "$1",
            "lat": "$2",
            "regex_match": "\\wd([\\-\u00a5d\u00a5.]+)\\!\\wd([\\-\u00a5d\u00a5.]+)",
            "mapurl_regex": "",
            "tinymapurl_regex": "https:\\/\\/goo\\.gl\\/maps\\/\\w+",
            "srid": "",
        },
    ],
    "format": [
        {
            "item_name": "name",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_content_column\"]/div[1]/h2/text()",
            "regex_item": "Null",
        },
        {
            "item_name": "zip",
            "regex_match": "\u3012([\\d\\-]+)\\s*",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]/p[1]",
            "regex_item": "$1 ",
        },
        {
            "item_name": "address",
            "regex_match": "\u3012[\\d\\-]+\\s*\"*\\s*<br>\\s*\"*\\s*([^<]+)\\s*<\"*",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]/p[1]",
            "regex_item": "$1 ",
        },
        {
            "item_name": "tel",
            "regex_match": "\u96fb\u8a71\u756a\u53f7\uff1a([\\d\\-]+)",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]",
            "regex_item": "$1 ",
        },
    ],
}
#118_ToyokoInn
# Fixture for the Toyoko Inn hotel pages: "format" rules per field, one
# "coordination" rule and "target" pages.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "format": [
        {
            "regex_match": "Null",
            "item_name": "name",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"mainArea\"]/div/h1/span/em/a/text()",
        },
        {
            "regex_match": "\u3012([\\d\\-]+)",
            "item_name": "zip",
            "regex_item": "$1 ",
            "remove_tag": "",
            "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/p",
        },
        {
            "regex_match": "\u3012[\\d\\-]+\\s*(?:&nbsp;)*([^<]+)<",
            "item_name": "address",
            "regex_item": "$1 ",
            "remove_tag": "",
            "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/p",
        },
        {
            "regex_match": "Null",
            "item_name": "tel",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/dl[1]/dd/text()",
        },
    ],
    "coordination": [
        {
            "regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.com\u00a5/maps\u00a5/place\u00a5/.*\\/*\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "mapurl_xpath": "",
            "srid": "",
            "mapurl_regex": "",
            "lat": "$1",
            "lon": "$2",
            "tinymapurl_regex": "",
            "xpath": "//*[@id=\"mainArea\"]/section[3]/div/div/ul/li/a",
        },
    ],
    "target": [
        {"url": "http://www.toyoko-inn.com/hotel/00264/index.html", "id": "1"},
        {"url": "http://www.toyoko-inn.com/hotel/00232/index.html", "id": "2"},
        {"url": "http://www.toyoko-inn.com/hotel/00126/index.html", "id": "3"},
        {"url": "http://www.toyoko-inn.com/hotel/00067/index.html", "id": "4"},
    ],
}
#119_Takashimaya
# Fixture for the Takashimaya store pages: one "coordination" rule with
# map-page settings, "target" pages and "format" rules per field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in regex_match and mapurl_regex -- confirm how the consumer interprets it.
input_data = {
    "coordination": [
        {
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "mapurl_regex": "\u00a5/[^\u00a5/]+\u00a5/access\u00a5/index.html",
            "srid": "",
            "xpath": "//*[@id=\"mainArea\"]/section[2]/div/table/tbody/tr[1]/td[1]/ul/li/a",
            "tinymapurl_regex": "",
            "mapurl_xpath": "//*[@id=\"storeInfo\"]/div/p/a",
            "lat": "$1",
            "lon": "$2",
        },
    ],
    "target": [
        {"id": "1", "url": "https://www.takashimaya.co.jp/rakusai/index.html"},
        {"id": "2", "url": "https://www.takashimaya.co.jp/okayama/index.html"},
        {"id": "3", "url": "https://www.takashimaya.co.jp/tachikawa/index.html"},
        {"id": "4", "url": "https://www.takashimaya.co.jp/yokohama/index.html"},
    ],
    "format": [
        {
            "xpath": "//*[@id=\"topicPath\"]/li[2]/text()",
            "regex_match": "Null",
            "item_name": "name",
            "remove_tag": "",
            "regex_item": "Null",
        },
        {
            "xpath": "//*[@id=\"header\"]/p",
            "regex_match": "\u3012([\\d\\-]+)\\s*",
            "item_name": "zip",
            "remove_tag": "",
            "regex_item": "$1 ",
        },
        {
            "xpath": "//*[@id=\"header\"]/p",
            "regex_match": "\u3012[\\d\\-]+\\s*(.+)\\s*TEL",
            "item_name": "address",
            "remove_tag": "",
            "regex_item": "$1 ",
        },
        {
            "xpath": "//*[@id=\"header\"]/p",
            "regex_match": "\u3012[\\d\\-]+\\s*.+\\s*TEL\\s*\\:*\\s*([\\d\\-\\(\\)\\s]+)",
            "item_name": "tel",
            "remove_tag": "",
            "regex_item": "$1 ",
        },
    ],
}
#120_SonicDrive-In
# Fixture for the Sonic Drive-In location pages: "target" pages, one
# "coordination" rule (empty "xpath", link taken via "mapurl_xpath") and
# "format" rules per field.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "target": [
        {"id": "1", "url": "https://locations.sonicdrivein.com/id/emmett/650-highway-16.html"},
        {"id": "2", "url": "https://locations.sonicdrivein.com/oh/nelsonville/1025-e--canal-street.html"},
        {"id": "3", "url": "https://locations.sonicdrivein.com/nj/elizabeth/573-spring-street.html"},
        {"id": "4", "url": "https://locations.sonicdrivein.com/sc/irmo/1150-dutch-fork-road.html"},
    ],
    "coordination": [
        {
            "lat": "$1",
            "mapurl_xpath": '//*[@id="nap"]/div/div[2]/div[1]/a',
            "srid": "",
            "xpath": "",
            "regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.com\u00a5/maps\u00a5/place\u00a5/.+\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lon": "$2",
            "mapurl_regex": "",
        },
    ],
    "format": [
        {
            "regex_item": "Null",
            "item_name": "name",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[1]/h1/span[2]/text()",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "regex_item": "Null",
            "item_name": "zip",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[2]/div[2]/div[1]/div[1]/address/span[3]/text()",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "regex_item": "$1 $2, $3",
            "item_name": "address",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[2]/div[2]/div[1]/div[1]/address",
            "regex_match": "class=\"c\\-address\\-street\\-1\">\\s*([^<]+)\\s*<.+itemprop=\\\"addressLocality\\\">\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\">\\s*([^<]+)\\s*",
            "remove_tag": "",
        },
        {
            "regex_item": "$1 ",
            "item_name": "tel",
            "xpath": "//*[@id=\"telephone\"]",
            "regex_match": ">\\s*\"*([\\d\\-\\(\\)\\s]+)\"*\\s*<",
            "remove_tag": "",
        },
    ],
}
# #121_Arby's
# input_data = {"coordination": [], "target": [{"id": "1", "url": "http://locations.arbys.com/us/wi/stoughton/900-nygaard-street.html"}, {"id": "2", "url": "http://locations.arbys.com/us/nm/raton/415-clayton-hwy.html"}, {"id": "3", "url": "http://locations.arbys.com/us/ks/mission/6780-johnson-dr.html"}, {"id": "4", "url": "http://locations.arbys.com/us/al/eufaula/815-south-eufaula-ave.html"}], "format": [{"item_name": "name", "regex_item": "$1", "xpath": "//*[@id=\"location-name\"]", "remove_tag": "", "regex_match": ">([^<]+)<"}, {"item_name": "zip", "regex_item": "$1 ", "xpath": "//*[@id=\"address\"]/span[3]", "remove_tag": "", "regex_match": "\\s*(\\d+)"}, {"item_name": "address", "regex_item": "$1 $2, $3", "xpath": "//*[@id=\"address\"]", "remove_tag": "", "regex_match": "class=\"c-address-street-1\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressLocality\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"\\s*>\\s*([^<]+)"}, {"item_name": "tel", "regex_item": "$1", "xpath": "//*[@id=\"telephone\"]", "remove_tag": "", "regex_match": ">([\\(\\w\\)\\s\\-]+)<"}]}
# #122_DUNNBROTHERSCOFFEE
# input_data = {"coordination": [{"mapurl_regex": "", "srid": "", "mapurl_xpath": "", "regex_match": "http:\\/\\/api\\.tiles\\.mapbox\\.com\\/v4\\/yext\\..+\\(([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)\\)\\/", "xpath": "//*[@id=\"schema-location\"]", "lon": "$1", "lat": "$2", "tinymapurl_regex": ""}], "format": [{"xpath": "//*[@id=\"location-name\"]/span[2]/text()", "item_name": "name", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}, {"xpath": "//*[@id=\"address\"]/span[3]/text()", "item_name": "zip", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}, {"xpath": "//*[@id=\"address\"]", "item_name": "address", "regex_match": "<span class=\"c\\-address\\-street\\-1\"[^>]*>([^<]+)<.+itemprop=\"addressLocality\"[^>]*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"[^>]*>\\s*(\\w{2})\\s*", "remove_tag": "", "regex_item": "$1 $2"}, {"xpath": "//*[@id=\"main\"]/div/header/div/div[3]/section/div[2]/div/div[2]/div/a/text()", "item_name": "tel", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}], "target": [{"url": "https://locations.dunnbrothers.com/tn/nashville/401-church-st.html", "id": "1"}, {"url": "https://locations.dunnbrothers.com/mn/minneapolis/601-marquette-ave.html", "id": "2"}, {"url": "https://locations.dunnbrothers.com/mn/hastings/919-vermillion-street.html", "id": "3"}, {"url": "https://locations.dunnbrothers.com/mn/apple-valley/15265-galaxie-ave.html", "id": "4"}]}
#123_StumptownCoffeeRoasters
# Fixture for the Stumptown Coffee Roasters location pages: "format" rules per
# field (tel has no XPath: "Null"), one "coordination" rule and "target" pages.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex -- confirm how the consumer interprets it.
input_data = {
    "format": [
        {
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
            "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/span/text()",
            "item_name": "name",
        },
        {
            "regex_item": "$1 ",
            "remove_tag": "",
            "regex_match": "\\w{2}\\s([\\d]{5})",
            "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div/a/h3",
            "item_name": "zip",
        },
        {
            "regex_item": "$1 $2 $3",
            "remove_tag": "",
            "regex_match": ">\\s*(.+)\\s*<br>\\s*([^,]+)\\s*,\\s*(\\w{2})",
            "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div/a/h3",
            "item_name": "address",
        },
        {
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
            "xpath": "Null",
            "item_name": "tel",
        },
    ],
    "coordination": [
        {
            "srid": "",
            "mapurl_regex": "",
            "mapurl_xpath": "",
            "regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.co\u00a5.jp\u00a5/maps\u00a5/place\u00a5/.+\\/\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div",
            "lat": "$1",
            "tinymapurl_regex": "https:\\/\\/maps\\.google\\.com\\/\\?q=\\.\\+",
            "lon": "$2",
        },
    ],
    "target": [
        {"id": "1", "url": "https://www.stumptowncoffee.com/locations/portland/ace-pdx"},
        {"id": "2", "url": "https://www.stumptowncoffee.com/locations/portland/annex"},
        {"id": "3", "url": "https://www.stumptowncoffee.com/locations/seattle/pine"},
        {"id": "4", "url": "https://www.stumptowncoffee.com/locations/newyork/ace-nyc"},
    ],
}
#124_OUTBACKSTEAKHOUSE
# input_data = {"target": [{"url": "https://www.outback.com/locations/ar/conway", "id": "1"}, {"url": "https://www.outback.com/locations/ca/arcadia", "id": "2"}, {"url": "https://www.outback.com/locations/ky/bowling-green", "id": "3"}, {"url": "https://www.outback.com/locations/ny/buffalo-amherst", "id": "4"}], "format": [{"remove_tag": "", "item_name": "name", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/div/h1/span/text()", "regex_item": "Null"}, {"remove_tag": "", "item_name": "zip", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/a[1]/span[4]/text()", "regex_item": "Null"}, {"remove_tag": "", "item_name": "address", "regex_match": "ng-bind=\"CurrentLocation.Address\"[^>]+>([^<]+)<[\\s\\S]+ng-bind=\"CurrentLocation.City\"[^>]+>([^<]+)<[\\s\\S]+ng-bind=\"CurrentLocation.State\"[^>]+>(\\w+)<", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/a[1]", "regex_item": "$1 $2, $3"}, {"remove_tag": "", "item_name": "tel", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/span/text()", "regex_item": "Null"}], "coordination": [{"lat": "$2", "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "xpath": "//*[@id=\"LocationDetailMap\"]/div/div/div[2]/a", "lon": "$1", "srid": "", "tinymapurl_regex": "", "mapurl_xpath": "", "mapurl_regex": ""}]}
# #125_REDLOBSTER
# input_data = {"target": [{"url": "https://www.redlobster.com/locations/list/mt/billings/2250-king-ave-w/", "id": "1"}, {"url": "https://www.redlobster.com/locations/list/nv/las-vegas/200-s-decatur-blvd/", "id": "2"}, {"url": "https://www.redlobster.com/locations/list/de/talleyville/309-rocky-run-parkway/", "id": "3"}, {"url": "https://www.redlobster.com/locations/list/sd/rapid-city/120-disk-drive/", "id": "4"}], "format": [{"regex_match": "<br>\\s*([^,]+),\\s*(\\w{2})", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/h1", "item_name": "name", "regex_item": "$1 $2"}, {"regex_match": ",\\s\\w{2}\\s(\\d+)", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/p", "item_name": "zip", "regex_item": "$1 "}, {"regex_match": ">\\s*([^<]+)<br>\\s*([^,]+),\\s(\\w{2})", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/p", "item_name": "address", "regex_item": "$1 $2, $3"}, {"regex_match": "Phone:[\\s\\S]([\\(\\)\\s\\d\\-]+\\d)\\s", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[2]/a", "item_name": "tel", "regex_item": "$1 "}], "coordination": [{"lon": "$2", "mapurl_xpath": "", "tinymapurl_regex": "", "mapurl_regex": "", "regex_match": "https:\\/\\/www\\.google\\.com\\/maps\\/dir\\/Current\\+Location\\/([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "srid": "", "lat": "$1", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/a[1]"}]}
#126_WHOLEFOODSMARKET
# Fixture for the Whole Foods Market store pages: "format" rules per field
# (all regex-based), one "coordination" rule with map-page settings, and five
# "target" pages.
# A later assignment in this file rebinds `input_data`.
# NOTE(review): "\u00a5" (yen sign) appears where a backslash would be expected
# in the coordination regex_match -- confirm how the consumer interprets it.
input_data = {
    "format": [
        {
            "regex_match": "<h1 class=\"store-title\">([^<]+)<\\/h1>",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "name",
        },
        {
            "regex_match": "class=\"postal-code\">([\\d\\-]+)<",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "zip",
        },
        {
            "regex_match": "<div class=\"thoroughfare\">([^\\<]+)<\\/div>[\\s\\S]+<span class=\"locality\">([^<]+)<[^>]+>,\\s*[^>]+.(\\w{2})<",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 $2, $3",
            "remove_tag": "",
            "item_name": "address",
        },
        {
            "regex_match": ">\\s*P: (\\d+[\\.\\-\\s]\\d+[\\.\\-\\s]\\d+)\\s*<br>",
            "xpath": "//*[@id=\"content-top\"]/div",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "tel",
        },
    ],
    "coordination": [
        {
            "srid": "",
            "tinymapurl_regex": "",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lon": "$1",
            "xpath": "//*[@id=\"block-store-store-location-map\"]/div/div/div/div[1]/div/div/div[2]/a",
            "mapurl_regex": "\\/store\\-locations\\/\\#store\\/\\d+",
            "lat": "$2",
            "mapurl_xpath": '//*[@id="block-views-9921a8226dcbc528f004a3704cd5a7ff"]/div/div/div/div/div[2]/div[5]/div/a[1]',
        },
    ],
    "target": [
        {"url": "http://www.wholefoodsmarket.com/stores/huntsville", "id": "1"},
        {"url": "http://www.wholefoodsmarket.com/stores/chandler", "id": "2"},
        {"url": "http://www.wholefoodsmarket.com/stores/scottsdale", "id": "3"},
        {"url": "http://www.wholefoodsmarket.com/stores/sedona", "id": "4"},
        {"url": "http://www.wholefoodsmarket.com/stores/2001marketstreet", "id": "5"},
    ],
}
# 127_Wegmans
# Crawl configuration for Wegmans store pages.
# NOTE: later `input_data = ...` assignments in this file rebind the name, so
# this configuration is only sent if those assignments are removed or
# commented out.
input_data = {
    "target": [
        {"id": "1", "url": "https://www.wegmans.com/stores/jamestown-ny.html"},
        {"id": "2", "url": "https://www.wegmans.com/stores/john-glenn-ny.html"},
        {"id": "3", "url": "https://www.wegmans.com/stores/johnson-city-ny.html"},
        {"id": "4", "url": "https://www.wegmans.com/stores/auburn-ny.html"},
    ],
    "coordination": [
        {
            # Uses real backslashes (U+005C). With U+00A5 yen signs in their
            # place the character class would match the literal characters
            # yen sign, 'd' and '.' instead of digits, so no coordinate could
            # ever be captured.
            "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\-\d\.]+),([\-\d\.]+)",
            "mapurl_regex": "",
            "mapurl_xpath": "",
            "tinymapurl_regex": "",
            "lat": "$1",
            "xpath": '//*[@id="storeDetailsMap"]/div/div/div[2]/a',
            "srid": "",
            "lon": "$2",
        },
    ],
    # One entry per extracted field; "Null" is passed through verbatim for
    # fields whose xpath already selects the final text.
    "format": [
        {
            "regex_match": "Null",
            "item_name": "name",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[1]/h1/text()",
        },
        {
            "regex_match": r"<br>\s*[^,]+,\s\w{2}\s(\d+)",
            "item_name": "zip",
            "regex_item": "$1 ",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/address/span",
        },
        {
            "regex_match": r">([\w\s]+)\w[\S\s]*<br>\s*([^,]+),\s*(\w{2})",
            "item_name": "address",
            "regex_item": "$1 $2, $3",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/address/span",
        },
        {
            "regex_match": "Null",
            "item_name": "tel",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/div/span/text()",
        },
    ],
}
# 128_OMNIHOTELS&RESORTS
# Crawl configuration for Omni Hotels & Resorts hotel pages.
# NOTE: later `input_data = ...` assignments in this file rebind the name, so
# this configuration is only sent if those assignments are removed or
# commented out.
input_data = {
    "target": [
        {"url": "https://www.omnihotels.com/hotels/san-francisco", "id": "1"},
        {"url": "https://www.omnihotels.com/hotels/dallas-park-west", "id": "2"},
        {"url": "https://www.omnihotels.com/hotels/frisco", "id": "3"},
        {"url": "https://www.omnihotels.com/hotels/richmond", "id": "4"},
    ],
    # Every field uses the same pattern, which captures the text between
    # leading and trailing whitespace; only the xpath and remove_tag differ.
    "format": [
        {
            "regex_match": r"^[\s\t]*(.+)[\s\t]*$",
            "item_name": "name",
            "remove_tag": "1",
            "xpath": '//*[@id="hero-image-container"]/div[2]/div/div/div[1]/div[1]/h1',
            "regex_item": "$1",
        },
        {
            "regex_match": r"^[\s\t]*(.+)[\s\t]*$",
            "item_name": "zip",
            "remove_tag": "1",
            "xpath": '//*[@id="maincontent"]/div/div/div/div[2]/div[1]/span[4]',
            "regex_item": "$1",
        },
        {
            "regex_match": r"^[\s\t]*(.+)[\s\t]*$",
            "item_name": "address",
            "remove_tag": "2",
            "xpath": '//*[@id="maincontent"]/div/div/div/div[2]/div[1]',
            "regex_item": "$1",
        },
        {
            "regex_match": r"^[\s\t]*(.+)[\s\t]*$",
            "item_name": "tel",
            "remove_tag": "1",
            "xpath": '//*[@id="maincontent"]/div/div/div/div[2]/div[2]/span[2]/span',
            "regex_item": "$1",
        },
    ],
    "coordination": [
        {
            # Uses real backslashes (U+005C) inside the capture groups. With
            # U+00A5 yen signs in their place the character class would match
            # the literal characters yen sign, 'd' and '.' instead of digits.
            "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\-\d\.]+),([\-\d\.]+)",
            "srid": "",
            "lat": "$1",
            "tinymapurl_regex": "",
            "mapurl_xpath": '//*[@id="maincontent"]/div/div/div/div[2]/div[3]/a',
            "xpath": '//*[@id="maincontainer_0_contentsection_1_genericrightcolumn_1_pnlContent"]/script[2]',
            "mapurl_regex": r"\/hotels\/[^\/]+\/[^\/]+\/directions",
            "lon": "$2",
        },
    ],
}
# 129_Bloomingdale's
# Crawl configuration for Bloomingdale's store-locator pages.
# NOTE: a later `input_data = ...` assignment in this file rebinds the name,
# so this configuration is only sent if that assignment is removed or
# commented out.
input_data = {
    "coordination": [
        {
            # Uses real backslashes (U+005C) inside the capture groups. With
            # U+00A5 yen signs in their place the character class would match
            # the literal characters yen sign, 'd' and '.' instead of digits.
            # NOTE(review): the doubled quotes ("") and the trailing ' "'
            # look like CSV-escaping residue; they are kept as found because
            # the intended pattern cannot be reconstructed from this file --
            # confirm against the page markup.
            "regex_match": r'itemprop="latitude" content=([\-\d\.]+)""\S+itemprop=""longitude"" content="([\-\d\.]+)"" "',
            "mapurl_xpath": "",
            "srid": "",
            "lon": "$2",
            "mapurl_regex": "",
            "xpath": '//*[@id="main"]/div/div[8]/div[2]/span',
            "lat": "$1",
            "tinymapurl_regex": "",
        },
    ],
    "target": [
        {"id": "1", "url": "http://locations.bloomingdales.com/stanford"},
        {"id": "2", "url": "http://locations.bloomingdales.com/glendale-galleria"},
        {"id": "3", "url": "http://locations.bloomingdales.com/south-coast-plaza"},
        {"id": "4", "url": "http://locations.bloomingdales.com/the-colonnade-outlet-at-sawgrass-mills"},
    ],
    # No regex post-processing here: every field passes "Null" and relies on
    # the xpath alone; only the address entry sets remove_tag.
    "format": [
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": '//*[@id="location-name"]',
            "regex_item": "Null",
            "item_name": "name",
        },
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": '//*[@id="address"]/span[4]',
            "regex_item": "Null",
            "item_name": "zip",
        },
        {
            "regex_match": "Null",
            "remove_tag": "1",
            "xpath": '//*[@id="address"]',
            "regex_item": "Null",
            "item_name": "address",
        },
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": '//*[@id="telephone"]',
            "regex_item": "Null",
            "item_name": "tel",
        },
    ],
}
# 130_Kochlöffel
# Crawl configuration for Kochlöffel restaurant detail pages. This is the
# last `input_data` assignment before the request is built, so it is the
# configuration that actually gets sent.
input_data = {
    "format": [
        {
            "item_name": "name",
            "regex_item": "Null",
            "xpath": '//*[@id="restaurants"]/div[1]/div[1]/div/h2/text()',
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "item_name": "zip",
            "regex_item": "$1 ",
            "xpath": '//*[@id="restaurants"]/div[1]/div[1]/div',
            "regex_match": r"<strong>Adresse:<\/strong><br>[\s\S][^,]+,\s*(\d+)",
            "remove_tag": "",
        },
        {
            "item_name": "address",
            "regex_item": "$1 ",
            "xpath": '//*[@id="restaurants"]/div[1]/div[1]/div',
            "regex_match": r"<strong>Adresse:<\/strong><br>[\s\S]\s*([^<]+)<br>",
            "remove_tag": "",
        },
        {
            "item_name": "tel",
            "regex_item": "$1 ",
            "xpath": '//*[@id="restaurants"]/div[1]/div[1]/div',
            "regex_match": r"Telefon<\/strong><br>[\s\S]+\s([\/\d]+)<br>",
            "remove_tag": "",
        },
    ],
    "coordination": [
        {
            "xpath": '//*[@id="google-map"]/div/div/div[2]/a',
            # NOTE(review): other configs in this file map lat -> $1 and
            # lon -> $2 for the same `ll=` pattern; this one is reversed.
            # Left unchanged -- confirm which order is intended.
            "lat": "$2",
            # Uses real backslashes (U+005C). With U+00A5 yen signs in their
            # place the character class would match the literal characters
            # yen sign, 'd' and '.' instead of digits, so no coordinate could
            # ever be captured.
            "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\-\d\.]+),([\-\d\.]+)",
            "mapurl_xpath": "",
            "tinymapurl_regex": "",
            "lon": "$1",
            "mapurl_regex": "",
            "srid": "",
        },
    ],
    "target": [
        {"id": "1", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=4"},
        {"id": "2", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=6"},
        {"id": "3", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=36"},
        {"id": "4", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=81"},
    ],
}
# Endpoint of the crawl service, with its fixed query parameters
# (spider_name, url, start_requests) exactly as the service is called here.
apiurl = "http://138.68.241.86:9080/crawl.json?spider_name=shop_info&url=http://www.dmoz.org/Computers/Programming/Languages/Ada&start_requests=true"

# Serialise the configuration to JSON and pass it as one URL-encoded
# `input_data` query parameter appended to the endpoint URL.
# NOTE: `input_data` is rebound from a dict to its JSON string here.
input_data = json.dumps(input_data)
input_data_values = urllib.parse.urlencode({"input_data": input_data})
full_url = apiurl + '&' + input_data_values

# Issue the GET request; the `with` block closes the connection once the
# body has been read. The body must be indented under `with`, otherwise the
# script fails with an IndentationError before sending anything.
with urllib.request.urlopen(full_url) as response:
    html = response.read()

# `html` is the raw response body (bytes); it is printed undecoded.
print(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment