# Gist page header captured during extraction (not part of the script):
# Created October 4, 2017 09:52
# Save ijharulislam/742805dd8e836e2c11d348d0f0b02dbf to your computer and use it in GitHub Desktop.
# This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
import urllib.parse | |
import urllib.request | |
import json | |
# Request payload for the Takashimaya store pages (hand-written variant).
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# The regex patterns are raw strings: sequences such as `\d`, `\s` and `\/`
# are not valid Python string escapes, and raw literals keep exactly the same
# text without triggering invalid-escape warnings.
input_data = {
    'format': [
        {
            'item_name': 'name',
            'xpath': '//*[@id="topicPath"]/li[2]/text()',
            'regex_match': 'Null',
            'regex_item': 'Null',
            'remove_tag': '',
        },
        {
            'item_name': 'zip',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒([\d\-]+)\s*',
            'regex_item': '$1 ',
            'remove_tag': '',
        },
        {
            'item_name': 'address',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*(.+)\s*TEL',
            'regex_item': '$1 ',
            'remove_tag': '',
        },
        {
            'item_name': 'tel',
            'xpath': '//*[@id="header"]/p',
            'regex_match': r'〒[\d\-]+\s*.+\s*TEL\s*\:*\s*([\d\-\(\)\s]+)',
            'regex_item': '$1',
            'remove_tag': '',
        },
    ],
    'target': [
        {
            'id': '1',
            'url': 'https://www.takashimaya.co.jp/rakusai/index.html',
        },
        {
            'id': '2',
            'url': 'https://www.takashimaya.co.jp/okayama/index.html',
        },
        {
            'id': '3',
            'url': 'https://www.takashimaya.co.jp/tachikawa/index.html',
        },
        {
            'id': '4',
            'url': 'https://www.takashimaya.co.jp/yokohama/index.html',
        },
    ],
    'coordination': [
        {
            "item_name": "coordination",
            "mapurl_xpath": '//*[@id="storeInfo"]/div/p/a',
            "xpath": '//*[@id="mapDiv"]/div/div/div[10]/div/div/div/div[7]/div/a',
            # The two capture groups are referenced by "lat" ($1) and "lon" ($2).
            "regex_match": r"https:\/\/maps\.google\.com\/maps\?.*ll=([\d\.]+),([\d\.]+)",
            "lat": "$1",
            "lon": "$2",
            "mapurl_regex": r"\/[^\/]+\/access\/index.html",
        },
    ],
}
# input_data = {"coordination": [], "target": [{"id": "1", "url": "http://locations.arbys.com/us/wi/stoughton/900-nygaard-street.html"}, {"id": "2", "url": "http://locations.arbys.com/us/nm/raton/415-clayton-hwy.html"}, {"id": "3", "url": "http://locations.arbys.com/us/ks/mission/6780-johnson-dr.html"}, {"id": "4", "url": "http://locations.arbys.com/us/al/eufaula/815-south-eufaula-ave.html"}], "format": [{"remove_tag": "", "xpath": "//*[@id=\"location-name\"]", "regex_match": ">([^<]+)<", "regex_item": "$1", "item_name": "name"}, {"remove_tag": "", "xpath": "//*[@id=\"address\"]/span[3]", "regex_match": "\\s*(\\d+)", "regex_item": "$1 ", "item_name": "zip"}, {"remove_tag": "", "xpath": "//*[@id=\"address\"]", "regex_match": "class=\"c-address-street-1\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressLocality\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"\\s*>\\s*([^<]+)", "regex_item": "$1 $2, $3", "item_name": "address"}, {"remove_tag": "", "xpath": "//*[@id=\"logistics\"]/div/div/div[2]/div[2]/div[3]/div/a", "regex_match": ">([\\(\\w\\)\\s\\-]+)<", "regex_item": "$1", "item_name": "tel"}]} | |
# input_data = {"target": [{"id": "1", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=89"}, {"id": "2", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=124"}, {"id": "3", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=38"}, {"id": "4", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=12"}, {"id": "5", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=91"}], "coordination": [{"regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "srid": "", "xpath": "//*[@id=\"gmap\"]/div/div/div[2]/a", "mapurl_regex": "", "lat": "$2", "lon": "$1", "tinymapurl_regex": "", "mapurl_xpath": ""}], "format": [{"regex_match": "Null", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/header[3]/div/h2/text()", "item_name": "name"}, {"regex_match": "\u3012([\\d\\-]+)", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "item_name": "zip"}, {"regex_match": "\u3012[\\d\\-]+<br>([^<]+)", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "item_name": "address"}, {"regex_match": "Null", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[2]/text()", "item_name": "tel"}, {"regex_match": "<dd[^>]+>([^<]+)(?:<br>)*([^<]+)</dd>", "regex_item": "$1 $2", "remove_tag": "", "xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[3]", "item_name": "hour"}]} | |
# Request payload for the Saizeriya shop-detail pages.
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# NOTE(review): "regex_match" contains U+00A5 (yen sign) where the rest of the
# pattern uses a backslash; as written `[\-¥d¥.]` is a class of '-', '¥', 'd'
# and '.', with no digit escape. Presumably `\d` / `\.` were intended --
# confirm whether the consumer normalises this before changing the data.
input_data = {
    "coordination": [
        {
            "lat": "$2",
            "mapurl_regex": "",
            "tinymapurl_regex": "",
            "mapurl_xpath": "",
            "regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\&lat=([\\-\u00a5d\u00a5.]+)&.*\\&lon=([\\-\u00a5d\u00a5.]+)",
            # Fixed: the original literal was `//*[@id=yolp-logo-link"]`, with
            # the opening quote of the attribute value missing.
            "xpath": "//*[@id=\"yolp-logo-link\"]",
            "srid": "",
            "lon": "$1",
        },
    ],
    "format": [
        {
            "xpath": "//*[@id=\"shop_detail\"]/h2/text()",
            "item_name": "name",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "Null",
            "item_name": "zip",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()",
            "item_name": "address",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
        {
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()",
            "item_name": "tel",
            "regex_item": "Null",
            "remove_tag": "",
            "regex_match": "Null",
        },
    ],
    "target": [
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"},
    ],
}
# Request payload for the Saizeriya shop-detail pages (reformatted variant).
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# NOTE(review): "regex_match" contains U+00A5 (yen sign) inside the character
# classes, so `[\-¥d¥.]` has no digit escape; presumably `\d` / `\.` were
# intended -- confirm whether the consumer normalises this.
input_data = {
    "target": [
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"},
        {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"},
    ],
    "coordination": [
        {
            "regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\&lat=([\\-\u00a5d\u00a5.]+)&.*\\&lon=([\\-\u00a5d\u00a5.]+)",
            "mapurl_xpath": "",
            "xpath": '//*[@id="yolp-logo-link"]',
            "lon": "$1",
            "srid": "",
            "lat": "$2",
            "tinymapurl_regex": "",
            "mapurl_regex": "",
        },
    ],
    "format": [
        {
            "regex_item": "Null",
            "item_name": "name",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/h2/text()",
        },
        {
            "regex_item": "Null",
            "item_name": "zip",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "Null",
        },
        {
            "regex_item": "Null",
            "item_name": "address",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()",
        },
        {
            "regex_item": "Null",
            "item_name": "tel",
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()",
        },
    ],
}
# Request payload for the uny.co.jp shop pages.
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# NOTE(review): the coordination "regex_match" uses U+00A5 (yen sign) in every
# position where a backslash escape would be expected (e.g. `¥/`, `¥d`), while
# "mapurl_regex" in the same entry uses real backslashes. Presumably the yen
# signs stand in for backslashes -- confirm whether the consumer normalises
# them before changing the data.
input_data = {
    "format": [
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleH\"]/h2/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "name",
        },
        {
            "regex_item": "Null",
            "xpath": "Null",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "zip",
        },
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[4]/td/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "address",
        },
        {
            "regex_item": "Null",
            "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[2]/td/text()",
            "regex_match": "Null",
            "remove_tag": "",
            "item_name": "tel",
        },
    ],
    "target": [
        {"url": "http://www.uny.co.jp/shop/207/index.html", "id": "1"},
        {"url": "http://www.uny.co.jp/shop/105/index.html", "id": "2"},
        {"url": "http://www.uny.co.jp/shop/135/index.html", "id": "3"},
        {"url": "http://www.uny.co.jp/shop/150/index.html", "id": "4"},
    ],
    "coordination": [
        {
            "mapurl_regex": "\\/shop\\/[^\\/]+\\/access\\.html",
            "xpath": "//*[@id=\"map\"]/div/div/div[2]/a",
            "lat": "$2",
            "lon": "$1",
            "tinymapurl_regex": "javascript:void\\(0\\);",
            "mapurl_xpath": "//*[@id=\"accessMap\"]/a",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "srid": "",
        },
    ],
}
input_data = {"target": [{"id": "1", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=89"}, {"id": "2", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=124"}, {"id": "3", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=38"}, {"id": "4", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=12"}, {"id": "5", "url": "https://www.burgerkingjapan.co.jp/stores/detail.html?sn=91"}], "format": [{"xpath": "//*[@id=\"storeMap\"]/header[3]/div/h2/text()", "regex_item": "Null", "item_name": "name", "remove_tag": "", "regex_match": "Null"}, {"xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "regex_item": "$1 ", "item_name": "zip", "remove_tag": "", "regex_match": "\u3012([\\d\\-]+)"}, {"xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[1]", "regex_item": "$1 ", "item_name": "address", "remove_tag": "", "regex_match": "\u3012[\\d\\-]+<br>([^<]+)"}, {"xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[2]/text()", "regex_item": "Null", "item_name": "tel", "remove_tag": "", "regex_match": "Null"}, {"xpath": "//*[@id=\"storeMap\"]/section/div/div[1]/dl/dd[3]", "regex_item": "$1 $2", "item_name": "hour", "remove_tag": "", "regex_match": "<dd[^>]+>([^<]+)(?:<br>)*([^<]+)</dd>"}], "coordination": [{"tinymapurl_regex": "", "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "lon": "$1", "mapurl_regex": "", "lat": "$2", "xpath": "//*[@id=\"gmap\"]/div/div/div[2]/a", "mapurl_xpath": "", "srid": ""}]} | |
input_data = {"target": [{"url": "https://map.yoshinoya.com/p/shopmap/dtl/1320/?&his=al1,nm", "id": "1"}, {"url": "https://map.yoshinoya.com/p/shopmap/dtl/1348/?&his=al1,nm", "id": "2"}, {"url": "https://map.yoshinoya.com/p/shopmap/dtl/979/?&his=al1,nm", "id": "3"}, {"url": "https://map.yoshinoya.com/p/shopmap/dtl/1371/?&his=al1,nm", "id": "4"}], "format": [{"regex_item": "$1 ", "xpath": "/html/body/div/div[2]/div/div/div[1]/h1", "remove_tag": "", "item_name": "name", "regex_match": "<img[^>]+>\\s*(.+)\\s*"}, {"regex_item": "Null", "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[1]/dd/ul/li[1]/text()", "remove_tag": "", "item_name": "zip", "regex_match": "Null"}, {"regex_item": "Null", "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[1]/dd/ul/li[2]/text()", "remove_tag": "", "item_name": "address", "regex_match": "Null"}, {"regex_item": "Null", "xpath": "/html/body/div/div[2]/div/div/div[2]/dl[2]/dd/text()", "remove_tag": "", "item_name": "tel", "regex_match": "Null"}], "coordination": [{"mapurl_regex": "", "xpath": "//*[@id=\"ZdcEmapMap\"]/div/div/div[2]/a", "srid": "", "tinymapurl_regex": "", "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "lat": "$2", "lon": "$1", "mapurl_xpath": ""}]} | |
input_data = {"coordination": [{"lat": "$2", "srid": "4301", "lon": "$1", "mapurl_xpath": "", "xpath": "//*[@id=\"MapiMapLink\"]/a", "mapurl_regex": "", "tinymapurl_regex": "", "regex_match": "\\/m\u00a5/saintmarc\u00a5/([\\-\u00a5d\u00a5.]+)_([\\-\u00a5d\u00a5.]+)_[\u00a5d\u00a5.]+\u00a5/\u00a5?brand_type=CFE"}], "target": [{"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20265/?brand_type=CFE", "id": "1"}, {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20347/?brand_type=CFE", "id": "2"}, {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20364/?brand_type=CFE", "id": "3"}, {"url": "https://www.saint-marc-hd.com/b/saintmarc/info/20393/?brand_type=CFE", "id": "4"}], "format": [{"xpath": "//*[@id=\"MapiContainer\"]/div/h1", "remove_tag": "", "item_name": "name", "regex_item": "$1 ", "regex_match": "<br>\"*\\s*(.*)\\s*\"*<"}, {"xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[1]/td", "remove_tag": "", "item_name": "zip", "regex_item": "$1 ", "regex_match": "\u3012\\s*([\\d-]+)\\s*<"}, {"xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[1]/td", "remove_tag": "", "item_name": "address", "regex_item": "$1 ", "regex_match": "<br>(.+)$"}, {"xpath": "//*[@id=\"js-info_area\"]/table[1]/tbody/tr[2]/td/ul/li/text()", "remove_tag": "", "item_name": "tel", "regex_item": "$1 ", "regex_match": "TEL\uff1a([\\d\\-]+)"}]} | |
#113_DoutorCoffee
input_data = {"target": [{"url": "http://sasp.mapion.co.jp/b/doutor/info/01010312/", "id": "1"}, {"url": "http://sasp.mapion.co.jp/b/doutor/info/01011384/", "id": "2"}, {"url": "http://sasp.mapion.co.jp/b/doutor/info/02011541/", "id": "3"}, {"url": "http://sasp.mapion.co.jp/b/doutor/info/01010074/", "id": "4"}], "format": [{"xpath": "//*[@id=\"MapiContainer\"]/div/h1/text()", "item_name": "name", "regex_item": "$1 ", "regex_match": "\\s*(.+)\\s*$", "remove_tag": ""}, {"xpath": "Null", "item_name": "zip", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}, {"xpath": "//*[@id=\"MapiDataArea\"]/div/table/tbody/tr[1]/td/text()", "item_name": "address", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}, {"xpath": "//*[@id=\"MapiDataArea\"]/div/table/tbody/tr[2]/td/text()", "item_name": "tel", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}], "coordination": [{"xpath": "//*[@id=\"MapiMapLink\"]/a", "mapurl_regex": "", "srid": "4301", "lon": "$1", "lat": "$2", "regex_match": "\\/m\u00a5/doutor\u00a5/([\\-\u00a5d\u00a5.]+)_([\\-\u00a5d\u00a5.]+)_[\u00a5d\u00a5.]+\u00a5/", "mapurl_xpath": "", "tinymapurl_regex": ""}]} | |
#114_SKYLARK | |
input_data = {"target": [{"id": "1", "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ62bn26a%2FXzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjGygAjGA4VyIEIlRaWpIFPnbHg6e%2FPjxnVPdqx9ur%2F5cWP346ZOiP1A2eKC%2FBKwPfq5iQUodiXn5xelYLXJyNAIu2WxAJOvWf4%3D&code=012771&_resl=true"}, {"id": "2", "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ4v63%2Bya%2BrzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjG0gAjGA4VyIEIlRaWpQJOeztnwdPbmx43rnuxY%2B3R%2F8%2BPG7sdNnRD7gbLFBfklYHv0cxMLUOxKzs8vSsFqk5GBJXbLYgGAhFno&code=011475&_resl=true"}, {"id": "3", "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ7uan66edvzOT1AdmJRaqJ%2BTmZxiX1xamJRckZMqYGBcUpiSkpRanExiGNkhsQxTjE2gQjGA4VyIEIlRaWpIFPnbHg6e%2FPjxnVPdqx9ur%2F5cWP346ZOiP1A2eKC%2FBKwPfq5iQUodiXn5xelYLXJ2ACHZbEAl2RaAQ%3D%3D&code=012840&_resl=true"}, {"id": "4", "url": "https://store-info.skylark.co.jp/skylark/spot/detail?bc=eNqLVnq2ZM7zLYseN3c8bm5%2B3DxdSUcJiJ7u2%2Fx087bnc3qA7MSi1ET9nMziEvvi1MSi5IyYUgMD45TElJSi1OJiEMfIDIljnGJsBhGMBwrlQIRKikpTQabO2fB09ubHjeue7Fj7dH%2Fz48bux02dEPuBssUF%2BSVge%2FRzEwtQ7ErOzy9KwWqTkYEhdstiAb9GWjU%3D&code=012861&_resl=true"}], "format": [{"regex_match": "Null", "regex_item": "Null", "xpath": "//*[@id=\"w_1_detail_1_1-spot-name\"]/text()", "remove_tag": "", "item_name": "name"}, {"regex_match": "Null", "regex_item": "Null", "xpath": "//*[@id=\"w_1_detail_1_1_2-tbody\"]/tr[1]/td[2]/span[1]/text()", "remove_tag": "", "item_name": "zip"}, {"regex_match": "Null", "regex_item": "Null", "xpath": "//*[@id=\"w_1_detail_1_1_2-beforeAddressSpace\"]/text()", "remove_tag": "", "item_name": "address"}, {"regex_match": "Null", "regex_item": "Null", "xpath": "//*[@id=\"w_1_detail_1_1_2-tbody\"]/tr[2]/td[2]/span/text()", "remove_tag": "", "item_name": "tel"}], "coordination": [{"regex_match": 
"http:\\/\\/maps\\.google\\.com\\/maps\\?q=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "tinymapurl_regex": "", "srid": "", "mapurl_regex": "", "lat": "$2", "mapurl_xpath": "", "xpath": "//*[@id=\"w_7_imagelink_1_1_7-label-upper\"]/span/a", "lon": "$1"}]} | |
#115_Saizeriya | |
input_data = {"format": [{"regex_match": "Null", "item_name": "name", "remove_tag": "", "xpath": "//*[@id=\"shop_detail\"]/h2/text()", "regex_item": "Null"}, {"regex_match": "Null", "item_name": "zip", "remove_tag": "", "xpath": "Null", "regex_item": "Null"}, {"regex_match": "Null", "item_name": "address", "remove_tag": "", "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[3]/td/text()", "regex_item": "Null"}, {"regex_match": "Null", "item_name": "tel", "remove_tag": "", "xpath": "//*[@id=\"shop_detail\"]/table/tbody/tr[4]/td/text()", "regex_item": "Null"}], "target": [{"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1362", "id": "1"}, {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1363", "id": "2"}, {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1358", "id": "3"}, {"url": "http://www.saizeriya.co.jp/restaurant/shop_detail.php?cd=1359", "id": "4"}], "coordination": [{"regex_match": "https:\\/\\/map\\.yahoo\\.co\\.jp\\/maps\\?.*\\;lat=([\\-\u00a5d\u00a5.]+)&.*\\;lon=([\\-\u00a5d\u00a5.]+)", "tinymapurl_regex": "", "srid": "", "lon": "$1", "mapurl_xpath": "", "lat": "$2", "mapurl_regex": "", "xpath": "//*[@id=\"yolp-logo-link\"]"}]} | |
#116_APITA | |
input_data = {"target": [{"id": "1", "url": "http://www.uny.co.jp/shop/207/index.html"}, {"id": "2", "url": "http://www.uny.co.jp/shop/105/index.html"}, {"id": "3", "url": "http://www.uny.co.jp/shop/135/index.html"}, {"id": "4", "url": "http://www.uny.co.jp/shop/150/index.html"}], "coordination": [{"xpath": "//*[@id=\"map\"]/div/div/div[2]/a", "mapurl_xpath": "//*[@id=\"accessMap\"]/a", "lat": "$2", "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "tinymapurl_regex": "javascript:void\\(0\\);", "lon": "$1", "mapurl_regex": "\\/shop\\/[^\\/]+\\/access\\.html", "srid": ""}], "format": [{"item_name": "name", "xpath": "//*[@id=\"titleH\"]/h2/text()", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}, {"item_name": "zip", "xpath": "Null", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}, {"item_name": "address", "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[4]/td/text()", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}, {"item_name": "tel", "xpath": "//*[@id=\"titleC\"]/div[1]/table/tbody/tr[2]/td/text()", "regex_item": "Null", "regex_match": "Null", "remove_tag": ""}]} | |
#117_Ito-Yokado | |
input_data = {"target": [{"url": "http://blog.itoyokado.co.jp/shop/765/index.html", "id": "1"}, {"url": "http://blog.itoyokado.co.jp/shop/225/index.html", "id": "2"}, {"url": "http://blog.itoyokado.co.jp/shop/196/index.html", "id": "3"}, {"url": "http://blog.itoyokado.co.jp/shop/078/index.html", "id": "4"}], "coordination": [{"mapurl_xpath": "", "xpath": "//*[@id=\"shop_content_column\"]/div[5]/div/div[1]/p[1]/a", "lon": "$1", "lat": "$2", "regex_match": "\\wd([\\-\u00a5d\u00a5.]+)\\!\\wd([\\-\u00a5d\u00a5.]+)", "mapurl_regex": "", "tinymapurl_regex": "https:\\/\\/goo\\.gl\\/maps\\/\\w+", "srid": ""}], "format": [{"item_name": "name", "regex_match": "Null", "remove_tag": "", "xpath": "//*[@id=\"shop_content_column\"]/div[1]/h2/text()", "regex_item": "Null"}, {"item_name": "zip", "regex_match": "\u3012([\\d\\-]+)\\s*", "remove_tag": "", "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]/p[1]", "regex_item": "$1 "}, {"item_name": "address", "regex_match": "\u3012[\\d\\-]+\\s*\"*\\s*<br>\\s*\"*\\s*([^<]+)\\s*<\"*", "remove_tag": "", "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]/p[1]", "regex_item": "$1 "}, {"item_name": "tel", "regex_match": "\u96fb\u8a71\u756a\u53f7\uff1a([\\d\\-]+)", "remove_tag": "", "xpath": "//*[@id=\"shop_content_column\"]/div[1]/div[1]", "regex_item": "$1 "}]} | |
#118_ToyokoInn | |
input_data = {"format": [{"regex_match": "Null", "item_name": "name", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"mainArea\"]/div/h1/span/em/a/text()"}, {"regex_match": "\u3012([\\d\\-]+)", "item_name": "zip", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/p"}, {"regex_match": "\u3012[\\d\\-]+\\s*(?: )*([^<]+)<", "item_name": "address", "regex_item": "$1 ", "remove_tag": "", "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/p"}, {"regex_match": "Null", "item_name": "tel", "regex_item": "Null", "remove_tag": "", "xpath": "//*[@id=\"mainArea\"]/div[2]/div[1]/dl[1]/dd/text()"}], "coordination": [{"regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.com\u00a5/maps\u00a5/place\u00a5/.*\\/*\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "mapurl_xpath": "", "srid": "", "mapurl_regex": "", "lat": "$1", "lon": "$2", "tinymapurl_regex": "", "xpath": "//*[@id=\"mainArea\"]/section[3]/div/div/ul/li/a"}], "target": [{"url": "http://www.toyoko-inn.com/hotel/00264/index.html", "id": "1"}, {"url": "http://www.toyoko-inn.com/hotel/00232/index.html", "id": "2"}, {"url": "http://www.toyoko-inn.com/hotel/00126/index.html", "id": "3"}, {"url": "http://www.toyoko-inn.com/hotel/00067/index.html", "id": "4"}]} | |
#119_Takashimaya | |
input_data = {"coordination": [{"regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "mapurl_regex": "\u00a5/[^\u00a5/]+\u00a5/access\u00a5/index.html", "srid": "", "xpath": "//*[@id=\"mainArea\"]/section[2]/div/table/tbody/tr[1]/td[1]/ul/li/a", "tinymapurl_regex": "", "mapurl_xpath": "//*[@id=\"storeInfo\"]/div/p/a", "lat": "$1", "lon": "$2"}], "target": [{"id": "1", "url": "https://www.takashimaya.co.jp/rakusai/index.html"}, {"id": "2", "url": "https://www.takashimaya.co.jp/okayama/index.html"}, {"id": "3", "url": "https://www.takashimaya.co.jp/tachikawa/index.html"}, {"id": "4", "url": "https://www.takashimaya.co.jp/yokohama/index.html"}], "format": [{"xpath": "//*[@id=\"topicPath\"]/li[2]/text()", "regex_match": "Null", "item_name": "name", "remove_tag": "", "regex_item": "Null"}, {"xpath": "//*[@id=\"header\"]/p", "regex_match": "\u3012([\\d\\-]+)\\s*", "item_name": "zip", "remove_tag": "", "regex_item": "$1 "}, {"xpath": "//*[@id=\"header\"]/p", "regex_match": "\u3012[\\d\\-]+\\s*(.+)\\s*TEL", "item_name": "address", "remove_tag": "", "regex_item": "$1 "}, {"xpath": "//*[@id=\"header\"]/p", "regex_match": "\u3012[\\d\\-]+\\s*.+\\s*TEL\\s*\\:*\\s*([\\d\\-\\(\\)\\s]+)", "item_name": "tel", "remove_tag": "", "regex_item": "$1 "}]} | |
#120_SonicDrive-In | |
# Request payload for the Sonic Drive-In location pages.
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# NOTE(review): the coordination "regex_match" uses U+00A5 (yen sign) where a
# backslash escape would be expected (e.g. `¥/`, `¥d`), whereas the "format"
# patterns below use real backslashes. Presumably the yen signs stand in for
# backslashes -- confirm whether the consumer normalises them.
input_data = {
    "target": [
        {"id": "1", "url": "https://locations.sonicdrivein.com/id/emmett/650-highway-16.html"},
        {"id": "2", "url": "https://locations.sonicdrivein.com/oh/nelsonville/1025-e--canal-street.html"},
        {"id": "3", "url": "https://locations.sonicdrivein.com/nj/elizabeth/573-spring-street.html"},
        {"id": "4", "url": "https://locations.sonicdrivein.com/sc/irmo/1150-dutch-fork-road.html"},
    ],
    "coordination": [
        {
            "lat": "$1",
            "mapurl_xpath": '//*[@id="nap"]/div/div[2]/div[1]/a',
            "srid": "",
            "xpath": "",
            "regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.com\u00a5/maps\u00a5/place\u00a5/.+\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lon": "$2",
            "mapurl_regex": "",
        },
    ],
    "format": [
        {
            "regex_item": "Null",
            "item_name": "name",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[1]/h1/span[2]/text()",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "regex_item": "Null",
            "item_name": "zip",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[2]/div[2]/div[1]/div[1]/address/span[3]/text()",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "regex_item": "$1 $2, $3",
            "item_name": "address",
            "xpath": "//*[@id=\"nap\"]/div/div[2]/div[2]/div[2]/div[1]/div[1]/address",
            "regex_match": "class=\"c\\-address\\-street\\-1\">\\s*([^<]+)\\s*<.+itemprop=\\\"addressLocality\\\">\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\">\\s*([^<]+)\\s*",
            "remove_tag": "",
        },
        {
            "regex_item": "$1 ",
            "item_name": "tel",
            "xpath": "//*[@id=\"telephone\"]",
            "regex_match": ">\\s*\"*([\\d\\-\\(\\)\\s]+)\"*\\s*<",
            "remove_tag": "",
        },
    ],
}
# #121_Arby's | |
# input_data = {"coordination": [], "target": [{"id": "1", "url": "http://locations.arbys.com/us/wi/stoughton/900-nygaard-street.html"}, {"id": "2", "url": "http://locations.arbys.com/us/nm/raton/415-clayton-hwy.html"}, {"id": "3", "url": "http://locations.arbys.com/us/ks/mission/6780-johnson-dr.html"}, {"id": "4", "url": "http://locations.arbys.com/us/al/eufaula/815-south-eufaula-ave.html"}], "format": [{"item_name": "name", "regex_item": "$1", "xpath": "//*[@id=\"location-name\"]", "remove_tag": "", "regex_match": ">([^<]+)<"}, {"item_name": "zip", "regex_item": "$1 ", "xpath": "//*[@id=\"address\"]/span[3]", "remove_tag": "", "regex_match": "\\s*(\\d+)"}, {"item_name": "address", "regex_item": "$1 $2, $3", "xpath": "//*[@id=\"address\"]", "remove_tag": "", "regex_match": "class=\"c-address-street-1\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressLocality\"\\s*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"\\s*>\\s*([^<]+)"}, {"item_name": "tel", "regex_item": "$1", "xpath": "//*[@id=\"telephone\"]", "remove_tag": "", "regex_match": ">([\\(\\w\\)\\s\\-]+)<"}]} | |
# #122_DUNNBROTHERSCOFFEE | |
# input_data = {"coordination": [{"mapurl_regex": "", "srid": "", "mapurl_xpath": "", "regex_match": "http:\\/\\/api\\.tiles\\.mapbox\\.com\\/v4\\/yext\\..+\\(([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)\\)\\/", "xpath": "//*[@id=\"schema-location\"]", "lon": "$1", "lat": "$2", "tinymapurl_regex": ""}], "format": [{"xpath": "//*[@id=\"location-name\"]/span[2]/text()", "item_name": "name", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}, {"xpath": "//*[@id=\"address\"]/span[3]/text()", "item_name": "zip", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}, {"xpath": "//*[@id=\"address\"]", "item_name": "address", "regex_match": "<span class=\"c\\-address\\-street\\-1\"[^>]*>([^<]+)<.+itemprop=\"addressLocality\"[^>]*>\\s*([^<]+)\\s*<.+itemprop=\"addressRegion\"[^>]*>\\s*(\\w{2})\\s*", "remove_tag": "", "regex_item": "$1 $2"}, {"xpath": "//*[@id=\"main\"]/div/header/div/div[3]/section/div[2]/div/div[2]/div/a/text()", "item_name": "tel", "regex_match": "Null", "remove_tag": "", "regex_item": "Null"}], "target": [{"url": "https://locations.dunnbrothers.com/tn/nashville/401-church-st.html", "id": "1"}, {"url": "https://locations.dunnbrothers.com/mn/minneapolis/601-marquette-ave.html", "id": "2"}, {"url": "https://locations.dunnbrothers.com/mn/hastings/919-vermillion-street.html", "id": "3"}, {"url": "https://locations.dunnbrothers.com/mn/apple-valley/15265-galaxie-ave.html", "id": "4"}]} | |
#123_StumptownCoffeeRoasters | |
input_data = {"format": [{"regex_item": "Null", "remove_tag": "", "regex_match": "Null", "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/span/text()", "item_name": "name"}, {"regex_item": "$1 ", "remove_tag": "", "regex_match": "\\w{2}\\s([\\d]{5})", "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div/a/h3", "item_name": "zip"}, {"regex_item": "$1 $2 $3", "remove_tag": "", "regex_match": ">\\s*(.+)\\s*<br>\\s*([^,]+)\\s*,\\s*(\\w{2})", "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div/a/h3", "item_name": "address"}, {"regex_item": "Null", "remove_tag": "", "regex_match": "Null", "xpath": "Null", "item_name": "tel"}], | |
"coordination": [{"srid": "", "mapurl_regex": "", "mapurl_xpath": "", "regex_match": "https:\u00a5/\u00a5/www\u00a5.google\u00a5.co\u00a5.jp\u00a5/maps\u00a5/place\u00a5/.+\\/\u00a5@([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "xpath": "//*[@id=\"main\"]/div[1]/div[2]/div[2]/div[1]/div", "lat": "$1", "tinymapurl_regex": "https:\\/\\/maps\\.google\\.com\\/\\?q=\\.\\+", "lon": "$2"}], "target": [{"id": "1", "url": "https://www.stumptowncoffee.com/locations/portland/ace-pdx"}, {"id": "2", "url": "https://www.stumptowncoffee.com/locations/portland/annex"}, {"id": "3", "url": "https://www.stumptowncoffee.com/locations/seattle/pine"}, {"id": "4", "url": "https://www.stumptowncoffee.com/locations/newyork/ace-nyc"}]} | |
#124_OUTBACKSTEAKHOUSE | |
# input_data = {"target": [{"url": "https://www.outback.com/locations/ar/conway", "id": "1"}, {"url": "https://www.outback.com/locations/ca/arcadia", "id": "2"}, {"url": "https://www.outback.com/locations/ky/bowling-green", "id": "3"}, {"url": "https://www.outback.com/locations/ny/buffalo-amherst", "id": "4"}], "format": [{"remove_tag": "", "item_name": "name", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/div/h1/span/text()", "regex_item": "Null"}, {"remove_tag": "", "item_name": "zip", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/a[1]/span[4]/text()", "regex_item": "Null"}, {"remove_tag": "", "item_name": "address", "regex_match": "ng-bind=\"CurrentLocation.Address\"[^>]+>([^<]+)<[\\s\\S]+ng-bind=\"CurrentLocation.City\"[^>]+>([^<]+)<[\\s\\S]+ng-bind=\"CurrentLocation.State\"[^>]+>(\\w+)<", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/a[1]", "regex_item": "$1 $2, $3"}, {"remove_tag": "", "item_name": "tel", "regex_match": "Null", "xpath": "//*[@id=\"LocationDetailHeader\"]/div/div/div/div/div/ul/li[1]/span/p[1]/span/text()", "regex_item": "Null"}], "coordination": [{"lat": "$2", "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "xpath": "//*[@id=\"LocationDetailMap\"]/div/div/div[2]/a", "lon": "$1", "srid": "", "tinymapurl_regex": "", "mapurl_xpath": "", "mapurl_regex": ""}]} | |
# #125_REDLOBSTER | |
# input_data = {"target": [{"url": "https://www.redlobster.com/locations/list/mt/billings/2250-king-ave-w/", "id": "1"}, {"url": "https://www.redlobster.com/locations/list/nv/las-vegas/200-s-decatur-blvd/", "id": "2"}, {"url": "https://www.redlobster.com/locations/list/de/talleyville/309-rocky-run-parkway/", "id": "3"}, {"url": "https://www.redlobster.com/locations/list/sd/rapid-city/120-disk-drive/", "id": "4"}], "format": [{"regex_match": "<br>\\s*([^,]+),\\s*(\\w{2})", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/h1", "item_name": "name", "regex_item": "$1 $2"}, {"regex_match": ",\\s\\w{2}\\s(\\d+)", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/p", "item_name": "zip", "regex_item": "$1 "}, {"regex_match": ">\\s*([^<]+)<br>\\s*([^,]+),\\s(\\w{2})", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/p", "item_name": "address", "regex_item": "$1 $2, $3"}, {"regex_match": "Phone:[\\s\\S]([\\(\\)\\s\\d\\-]+\\d)\\s", "remove_tag": "", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[2]/a", "item_name": "tel", "regex_item": "$1 "}], "coordination": [{"lon": "$2", "mapurl_xpath": "", "tinymapurl_regex": "", "mapurl_regex": "", "regex_match": "https:\\/\\/www\\.google\\.com\\/maps\\/dir\\/Current\\+Location\\/([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)", "srid": "", "lat": "$1", "xpath": "//*[@id=\"mainContent\"]/div[1]/div[1]/div[1]/a[1]"}]} | |
#126_WHOLEFOODSMARKET | |
# Request payload for the Whole Foods Market store pages.
# NOTE: `input_data` is rebound by later assignments in this file, so this
# value only takes effect if those later assignments are removed.
# NOTE(review): the coordination "regex_match" uses U+00A5 (yen sign) where a
# backslash escape would be expected (e.g. `¥/`, `¥d`), whereas "mapurl_regex"
# in the same entry uses real backslashes. Presumably the yen signs stand in
# for backslashes -- confirm whether the consumer normalises them.
input_data = {
    "format": [
        {
            "regex_match": "<h1 class=\"store-title\">([^<]+)<\\/h1>",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "name",
        },
        {
            "regex_match": "class=\"postal-code\">([\\d\\-]+)<",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "zip",
        },
        {
            "regex_match": "<div class=\"thoroughfare\">([^\\<]+)<\\/div>[\\s\\S]+<span class=\"locality\">([^<]+)<[^>]+>,\\s*[^>]+.(\\w{2})<",
            "xpath": "//*[@id=\"content-top\"]",
            "regex_item": "$1 $2, $3",
            "remove_tag": "",
            "item_name": "address",
        },
        {
            "regex_match": ">\\s*P: (\\d+[\\.\\-\\s]\\d+[\\.\\-\\s]\\d+)\\s*<br>",
            "xpath": "//*[@id=\"content-top\"]/div",
            "regex_item": "$1 ",
            "remove_tag": "",
            "item_name": "tel",
        },
    ],
    "coordination": [
        {
            "srid": "",
            "tinymapurl_regex": "",
            "regex_match": "https:\u00a5/\u00a5/maps\u00a5.google\u00a5.com\u00a5/maps\u00a5?.*ll=([\\-\u00a5d\u00a5.]+),([\\-\u00a5d\u00a5.]+)",
            "lon": "$1",
            "xpath": "//*[@id=\"block-store-store-location-map\"]/div/div/div/div[1]/div/div/div[2]/a",
            "mapurl_regex": "\\/store\\-locations\\/\\#store\\/\\d+",
            "lat": "$2",
            "mapurl_xpath": '//*[@id="block-views-9921a8226dcbc528f004a3704cd5a7ff"]/div/div/div/div/div[2]/div[5]/div/a[1]',
        },
    ],
    "target": [
        {"url": "http://www.wholefoodsmarket.com/stores/huntsville", "id": "1"},
        {"url": "http://www.wholefoodsmarket.com/stores/chandler", "id": "2"},
        {"url": "http://www.wholefoodsmarket.com/stores/scottsdale", "id": "3"},
        {"url": "http://www.wholefoodsmarket.com/stores/sedona", "id": "4"},
        {"url": "http://www.wholefoodsmarket.com/stores/2001marketstreet", "id": "5"},
    ],
}
#127_Wegmans
# Scraping configuration for Wegmans store pages: the target URLs, how to pull
# the coordinates from the embedded map link, and one "format" entry per output
# field. A "regex_match"/"regex_item" of "Null" is the literal string "Null";
# presumably the spider treats it as "use the XPath result as-is" - confirm
# against the spider. A later assignment to `input_data` overrides this one.
input_data = {
    "target": [
        {"id": "1", "url": "https://www.wegmans.com/stores/jamestown-ny.html"},
        {"id": "2", "url": "https://www.wegmans.com/stores/john-glenn-ny.html"},
        {"id": "3", "url": "https://www.wegmans.com/stores/johnson-city-ny.html"},
        {"id": "4", "url": "https://www.wegmans.com/stores/auburn-ny.html"},
    ],
    "coordination": [
        {
            # Escapes must be real backslashes: a yen sign (U+00A5) in their
            # place is matched literally and the pattern never hits a maps URL.
            "regex_match": "https:\\/\\/maps\\.google\\.com\\/maps\\?.*ll=([\\-\\d\\.]+),([\\-\\d\\.]+)",
            "mapurl_regex": "",
            "mapurl_xpath": "",
            "tinymapurl_regex": "",
            "lat": "$1",
            "xpath": "//*[@id=\"storeDetailsMap\"]/div/div/div[2]/a",
            "srid": "",
            "lon": "$2",
        },
    ],
    "format": [
        {
            "regex_match": "Null",
            "item_name": "name",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[1]/h1/text()",
        },
        {
            "regex_match": "<br>\\s*[^,]+,\\s\\w{2}\\s(\\d+)",
            "item_name": "zip",
            "regex_item": "$1 ",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/address/span",
        },
        {
            "regex_match": ">([\\w\\s]+)\\w[\\S\\s]*<br>\\s*([^,]+),\\s*(\\w{2})",
            "item_name": "address",
            "regex_item": "$1 $2, $3",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/address/span",
        },
        {
            "regex_match": "Null",
            "item_name": "tel",
            "regex_item": "Null",
            "remove_tag": "",
            "xpath": "/html/body/div[1]/div[2]/div/div[1]/div/div[2]/div/div/span/text()",
        },
    ],
}
#128_OMNIHOTELS&RESORTS
# Scraping configuration for Omni Hotels & Resorts pages. Every "format" entry
# uses the same trim-style pattern (capture what lies between leading and
# trailing whitespace) and a non-empty "remove_tag"; what the values "1" and
# "2" mean to the spider is not visible in this file.
# A later assignment to `input_data` overrides this one.
input_data = {
    "target": [
        {"url": "https://www.omnihotels.com/hotels/san-francisco", "id": "1"},
        {"url": "https://www.omnihotels.com/hotels/dallas-park-west", "id": "2"},
        {"url": "https://www.omnihotels.com/hotels/frisco", "id": "3"},
        {"url": "https://www.omnihotels.com/hotels/richmond", "id": "4"},
    ],
    "format": [
        {
            "regex_match": "^[\\s\\t]*(.+)[\\s\\t]*$",
            "item_name": "name",
            "remove_tag": "1",
            "xpath": "//*[@id=\"hero-image-container\"]/div[2]/div/div/div[1]/div[1]/h1",
            "regex_item": "$1",
        },
        {
            "regex_match": "^[\\s\\t]*(.+)[\\s\\t]*$",
            "item_name": "zip",
            "remove_tag": "1",
            "xpath": "//*[@id=\"maincontent\"]/div/div/div/div[2]/div[1]/span[4]",
            "regex_item": "$1",
        },
        {
            "regex_match": "^[\\s\\t]*(.+)[\\s\\t]*$",
            "item_name": "address",
            "remove_tag": "2",
            "xpath": "//*[@id=\"maincontent\"]/div/div/div/div[2]/div[1]",
            "regex_item": "$1",
        },
        {
            "regex_match": "^[\\s\\t]*(.+)[\\s\\t]*$",
            "item_name": "tel",
            "remove_tag": "1",
            "xpath": "//*[@id=\"maincontent\"]/div/div/div/div[2]/div[2]/span[2]/span",
            "regex_item": "$1",
        },
    ],
    "coordination": [
        {
            # The character classes need `\d` and `\.`: a yen sign (U+00A5) in
            # place of the backslash turns them into [-¥d.] and digits other
            # than none at all would ever match.
            "regex_match": "https:\\/\\/maps\\.google\\.com\\/maps\\?.*ll=([\\-\\d\\.]+),([\\-\\d\\.]+)",
            "srid": "",
            "lat": "$1",
            "tinymapurl_regex": "",
            "mapurl_xpath": "//*[@id=\"maincontent\"]/div/div/div/div[2]/div[3]/a",
            "xpath": "//*[@id=\"maincontainer_0_contentsection_1_genericrightcolumn_1_pnlContent\"]/script[2]",
            "mapurl_regex": "\\/hotels\\/[^\\/]+\\/[^\\/]+\\/directions",
            "lon": "$2",
        },
    ],
}
#129_bloomingdale's
# Scraping configuration for Bloomingdale's location pages. All four "format"
# fields use the literal string "Null" for the regex and template; the
# coordinates are read from latitude/longitude `itemprop` attributes.
# A later assignment to `input_data` overrides this one.
input_data = {
    "coordination": [
        {
            # The numeric classes need `\d` and `\.`; a yen sign (U+00A5) in
            # place of the backslash made them match no digits at all.
            # NOTE(review): the doubled quotes (`""`), the missing opening
            # quote after the first `content=` and the trailing `" "` look like
            # CSV-escaping leftovers; they are kept verbatim because the
            # intended pattern cannot be determined from this file - verify
            # against the real page markup.
            "regex_match": "itemprop=\"latitude\" content=([\\-\\d\\.]+)\"\"\\S+itemprop=\"\"longitude\"\" content=\"([\\-\\d\\.]+)\"\" \"",
            "mapurl_xpath": "",
            "srid": "",
            "lon": "$2",
            "mapurl_regex": "",
            "xpath": "//*[@id=\"main\"]/div/div[8]/div[2]/span",
            "lat": "$1",
            "tinymapurl_regex": "",
        },
    ],
    "target": [
        {"id": "1", "url": "http://locations.bloomingdales.com/stanford"},
        {"id": "2", "url": "http://locations.bloomingdales.com/glendale-galleria"},
        {"id": "3", "url": "http://locations.bloomingdales.com/south-coast-plaza"},
        {"id": "4", "url": "http://locations.bloomingdales.com/the-colonnade-outlet-at-sawgrass-mills"},
    ],
    "format": [
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"location-name\"]",
            "regex_item": "Null",
            "item_name": "name",
        },
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"address\"]/span[4]",
            "regex_item": "Null",
            "item_name": "zip",
        },
        {
            "regex_match": "Null",
            "remove_tag": "1",
            "xpath": "//*[@id=\"address\"]",
            "regex_item": "Null",
            "item_name": "address",
        },
        {
            "regex_match": "Null",
            "remove_tag": "",
            "xpath": "//*[@id=\"telephone\"]",
            "regex_item": "Null",
            "item_name": "tel",
        },
    ],
}
#130_Kochloeffel
# Scraping configuration for Kochloeffel restaurant detail pages. This is the
# last assignment to `input_data` before the request is built, so it is the
# configuration that actually gets sent.
input_data = {
    "format": [
        {
            "item_name": "name",
            "regex_item": "Null",
            "xpath": "//*[@id=\"restaurants\"]/div[1]/div[1]/div/h2/text()",
            "regex_match": "Null",
            "remove_tag": "",
        },
        {
            "item_name": "zip",
            "regex_item": "$1 ",
            "xpath": "//*[@id=\"restaurants\"]/div[1]/div[1]/div",
            "regex_match": "<strong>Adresse:<\\/strong><br>[\\s\\S][^,]+,\\s*(\\d+)",
            "remove_tag": "",
        },
        {
            "item_name": "address",
            "regex_item": "$1 ",
            "xpath": "//*[@id=\"restaurants\"]/div[1]/div[1]/div",
            "regex_match": "<strong>Adresse:<\\/strong><br>[\\s\\S]\\s*([^<]+)<br>",
            "remove_tag": "",
        },
        {
            "item_name": "tel",
            "regex_item": "$1 ",
            "xpath": "//*[@id=\"restaurants\"]/div[1]/div[1]/div",
            "regex_match": "Telefon<\\/strong><br>[\\s\\S]+\\s([\\/\\d]+)<br>",
            "remove_tag": "",
        },
    ],
    "coordination": [
        {
            "xpath": "//*[@id=\"google-map\"]/div/div/div[2]/a",
            # NOTE(review): Google Maps `ll=` is normally "latitude,longitude",
            # and other configs in this file use lat=$1 / lon=$2. Confirm that
            # the swapped groups here are intentional.
            "lat": "$2",
            # Escapes must be real backslashes: a yen sign (U+00A5) in their
            # place is matched literally and the pattern never hits a maps URL.
            "regex_match": "https:\\/\\/maps\\.google\\.com\\/maps\\?.*ll=([\\-\\d\\.]+),([\\-\\d\\.]+)",
            "mapurl_xpath": "",
            "tinymapurl_regex": "",
            "lon": "$1",
            "mapurl_regex": "",
            "srid": "",
        },
    ],
    "target": [
        {"id": "1", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=4"},
        {"id": "2", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=6"},
        {"id": "3", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=36"},
        {"id": "4", "url": "http://www.kochloeffel.de/filialfinder/detail/?store=81"},
    ],
}
# Endpoint of the crawl service. The query string selects the `shop_info`
# spider; the scraping configuration is appended below as `input_data`.
apiurl = "http://138.68.241.86:9080/crawl.json?spider_name=shop_info&url=http://www.dmoz.org/Computers/Programming/Languages/Ada&start_requests=true"

# Upper bound for the whole HTTP exchange, so an unresponsive server cannot
# hang the script forever. Kept generous because the service presumably crawls
# every target page before answering.
REQUEST_TIMEOUT_SECONDS = 300

# Serialise the configuration dict to JSON, then URL-encode it as a single
# `input_data` query parameter (`input_data` is rebound to the JSON string).
input_data = json.dumps(input_data)
input_data_values = urllib.parse.urlencode({"input_data": input_data})
full_url = apiurl + "&" + input_data_values

# No request body is supplied, so this is a GET request.
with urllib.request.urlopen(full_url, timeout=REQUEST_TIMEOUT_SECONDS) as response:
    html = response.read()
    # Prefer the charset announced by the server; fall back to UTF-8.
    response_charset = response.headers.get_content_charset() or "utf-8"

# Print readable text, not the `b'...'` repr of the raw bytes;
# undecodable bytes are replaced, not raised on.
print(html.decode(response_charset, errors="replace"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment