Collect images while crawling links, using multiple threads
# coding:utf-8
"""
Collect images while crawling links, using multiple threads.
Usage:
    python scrapingImage.py --target_urls (target URLs) --save_dir images
Extended from the article below:
    http://www.mathgram.xyz/entry/scraping/matome
"""
import argparse
import io
import multiprocessing
import os
import random
import re
import time
import traceback
import urllib.request
from multiprocessing import Pool
from urllib.parse import urlparse, urljoin, quote_plus

import requests
from bs4 import BeautifulSoup
from PIL import Image
#-----------------------------------------------------------
max_urlProcess = 4           # number of consumer processes for crawling URLs
max_imgProcess_perUrl = 10   # pool size for downloading the images found on one page
#-----------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--target_urls", nargs='*', required=True, help="urls to scrape.")
parser.add_argument("--save_dir", default="./scraped_images", help="path to save folder")
parser.add_argument("--width_px_threshold", type=int, default=400, help="minimum width in px to save; both width and height must be at least their thresholds.")
parser.add_argument("--height_px_threshold", type=int, default=None, help="minimum height in px to save")
parser.add_argument("--banner_ratio", type=float, default=4, help="skip an image as a banner if its aspect ratio exceeds this value")
a = parser.parse_args()
if a.height_px_threshold is None:
    a.height_px_threshold = a.width_px_threshold
finishedLinks = []  # URLs already visited
finishedImgs = []   # image URLs already downloaded

# Start the consumer processes
# ref: http://ja.pymotw.com/2/multiprocessing/communication.html
class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # A poison pill means shutdown
                print('%s: Exiting' % proc_name)
                self.task_queue.task_done()
                break
            print('%s: %s' % (proc_name, next_task))
            answer = next_task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return
results = multiprocessing.Queue()
tasks = multiprocessing.JoinableQueue()
consumers = [Consumer(tasks, results)
             for i in range(max_urlProcess)]
for w in consumers:
    w.start()
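# NOTE: the consumer processes above only do work if callables are put on
# `tasks`; the current flow crawls recursively via geturl() instead, and the
# matching tasks.put(Task(l)) call further down is commented out. Below is a
# minimal sketch (an assumption, not part of the original gist) of what such
# a Task could look like, following the pymotw producer/consumer example:
class Task(object):
    def __init__(self, url):
        self.url = url
    def __call__(self):
        # crawl one page; the return value ends up on the results queue
        return geturl(self.url)
    def __str__(self):
        return 'Task(%s)' % self.url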

def save_image(imgurl, timeout=10):
    try:
        time.sleep(random.randint(1, 3) * 0.1)  # 100~300 ms wait
        response = requests.get(imgurl, allow_redirects=False, timeout=timeout)
        if response.status_code != 200:
            raise Exception("HTTP status: " + str(response.status_code))
        content_type = response.headers["content-type"]
        if 'image' not in content_type:
            raise Exception("Content-Type: " + content_type)
        imageBytes = response.content
        image = Image.open(io.BytesIO(imageBytes))
        width, height = image.size
        if width < a.width_px_threshold or height < a.height_px_threshold:
            print("skipped small image : " + imgurl)
            return
        if width / height > a.banner_ratio or height / width > a.banner_ratio:
            print("skipped banner image : " + imgurl)
            return
        ext = ""
        if "jpeg" in content_type or "jpg" in content_type:
            ext = "jpg"
        elif "png" in content_type:
            ext = "png"
        elif "gif" in content_type:
            ext = "gif"
        # build a safe filename from the URL by replacing characters not allowed in file names
        saveName = re.sub(r'[\\/:?."<>|]', '_', imgurl) + "." + ext
        with open(os.path.join(a.save_dir, saveName), "wb") as fout:
            fout.write(imageBytes)
        print("saved image : " + saveName)
    except Exception:
        print("FAIL to save image : " + imgurl)
        traceback.print_exc()
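# Example of downloading a single image synchronously (hypothetical URL):
#   save_image("https://example.com/sample.jpg", timeout=5)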

def geturl(targetUrl):
    """
    Parse <a> tags, save the images on the page, then follow links within the same domain.
    :param targetUrl:
    :return:
    """
    try:
        targetUrl = replaceJapaneseStr(targetUrl)
        # targetUrl = urllib.parse.quote(targetUrl, safe="/:=?&")
        print("START parse url : " + targetUrl)
        domain = _parse_domain(targetUrl)
        # Fetch the body of the linked page (one hop from the listing)
        response = urllib.request.urlopen(targetUrl)
        body = response.read()
        soup = BeautifulSoup(body, "lxml")
        # parse <a> tags, resolving relative hrefs against the current URL
        links = []
        for x in soup.find_all('a'):
            try:
                links.append(urljoin(targetUrl, x["href"]))
            except KeyError:
                pass
        # keep only links on the same domain that have not been visited yet
        links = list(filter((lambda x: _parse_domain(x) == domain and x not in finishedLinks), links))
        finishedLinks.extend(links)
        # parse <img> tags, resolving relative srcs against the current URL
        imgUrls = []
        for x in soup.find_all('img'):
            try:
                imgUrls.append(urljoin(targetUrl, x["src"]))
            except KeyError:
                pass
        # keep only images that have not been downloaded yet
        imgUrls = list(filter((lambda x: x not in finishedImgs), imgUrls))
        finishedImgs.extend(imgUrls)
        # save images -------------------------------------------------------------------
        p = Pool(max_imgProcess_perUrl)
        output = p.map(save_image, imgUrls)
        p.close()  # shut down the pool
        p.join()   # wait until the worker processes finish
        # go to the next urls -----------------------------------------------------------
        for l in links:
            # tasks.put(Task(l))
            geturl(l)
            time.sleep(2)
        print("FINISH to parse url : " + targetUrl)
    except Exception:
        print("FAIL to parse url : " + targetUrl)
        traceback.print_exc()

def _parse_domain(targetUrl):
    parsed_uri = urlparse(targetUrl)
    return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
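# e.g. _parse_domain("https://example.com/a/b?x=1") -> "https://example.com/"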

def replaceJapaneseStr(japaneseUrl):
    # percent-encode non-ASCII characters (e.g. Japanese) in the query string
    p = urlparse(japaneseUrl)
    query = quote_plus(p.query, safe='=&')
    japaneseUrl = '{}://{}{}{}{}{}{}{}{}'.format(
        p.scheme, p.netloc, p.path,
        ';' if p.params else '', p.params,
        '?' if p.query else '', query,
        '#' if p.fragment else '', p.fragment)
    return japaneseUrl
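# e.g. replaceJapaneseStr("https://example.com/search?q=画像")
#      -> "https://example.com/search?q=%E7%94%BB%E5%83%8F"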

def main():
    if not os.path.exists(a.save_dir):
        os.mkdir(a.save_dir)
    for url in a.target_urls:
        geturl(url)

if __name__ == "__main__":
    main()
    # Add a poison pill for each consumer so the worker processes exit
    for i in range(max_urlProcess):
        tasks.put(None)
    # Wait until all queued tasks are finished
    tasks.join()