Collect images by following links, using multiple processes
# coding:utf-8
"""
マルチスレッドでリンクをたどりながら画像を収集
Usage:
python scrapingImage.py --target_urls (対象URL) --save_dir images
↓を参考に加筆したもの
http://www.mathgram.xyz/entry/scraping/matome
"""
import argparse
import io
import multiprocessing
import os
import random
import re
import time
import traceback
import urllib.parse
import urllib.request
from multiprocessing import Pool
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image
#-----------------------------------------------------------
max_urlProcess = 4           # number of consumer processes for URL tasks
max_imgProcess_perUrl = 10   # pool size for downloading the images found on one page
#-----------------------------------------------------------
parser = argparse.ArgumentParser()
parser.add_argument("--target_urls", nargs='*', required=True, help="urls to scrape.")
parser.add_argument("--save_dir", default="./scraped_images", help="path to save folder")
parser.add_argument("--width_px_threshold", type=int, default=400,
                    help="min width in px to save. both w,h must be above their thresholds.")
parser.add_argument("--height_px_threshold", type=int, default=None,
                    help="min height in px to save. defaults to --width_px_threshold.")
parser.add_argument("--banner_ratio", type=float, default=4,
                    help="skip an image as a banner if its aspect ratio is above this value")
a = parser.parse_args()
if a.height_px_threshold is None:
    a.height_px_threshold = a.width_px_threshold

finishedLinks = []  # URLs that have already been crawled
finishedImgs = []   # image URLs that have already been downloaded
# Start the consumer processes
# ref: http://ja.pymotw.com/2/multiprocessing/communication.html
class Consumer(multiprocessing.Process):
    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue

    def run(self):
        proc_name = self.name
        while True:
            next_task = self.task_queue.get()
            if next_task is None:
                # A poison pill means shutdown
                print('%s: Exiting' % proc_name)
                self.task_queue.task_done()
                break
            print('%s: %s' % (proc_name, next_task))
            answer = next_task()
            self.task_queue.task_done()
            self.result_queue.put(answer)
        return
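

# --- Hypothetical sketch, not part of the original gist ---------------------
# The queue/consumer path is currently unused: geturl() is called recursively
# instead (see the commented-out "tasks.put(Task(l))" below). If that path
# were enabled, a minimal Task callable wrapping geturl might look like this:
class Task(object):
    def __init__(self, url):
        self.url = url

    def __call__(self):
        # Crawl the page; geturl is defined later in this file.
        return geturl(self.url)

    def __str__(self):
        return 'Task(%s)' % self.url
# -----------------------------------------------------------------------------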
results = multiprocessing.Queue()
tasks = multiprocessing.JoinableQueue()
consumers = [Consumer(tasks, results)
             for i in range(max_urlProcess)]
for w in consumers:
    w.start()
def save_image(imgurl, timeout=10):
    try:
        time.sleep(random.randint(1, 3) * 0.1)  # wait 100-300 ms
        response = requests.get(imgurl, allow_redirects=False, timeout=timeout)
        if response.status_code != 200:
            raise Exception("HTTP status: " + str(response.status_code))
        content_type = response.headers["content-type"]
        if 'image' not in content_type:
            raise Exception("Content-Type: " + content_type)
        imageBytes = response.content
        image = Image.open(io.BytesIO(imageBytes))
        width, height = image.size
        if width < a.width_px_threshold or height < a.height_px_threshold:
            print("skipped small image : " + imgurl)
            return
        if width / height > a.banner_ratio or height / width > a.banner_ratio:
            print("skipped banner image : " + imgurl)
            return
        ext = ""
        if "jpeg" in content_type or "jpg" in content_type:
            ext = "jpg"
        elif "png" in content_type:
            ext = "png"
        elif "gif" in content_type:
            ext = "gif"
        # build a file name from the URL by replacing characters that are not
        # allowed in file names
        saveName = re.sub(r'[\\/:?."<>|]', '_', imgurl) + "." + ext
        with open(os.path.join(a.save_dir, saveName), "wb") as fout:
            fout.write(imageBytes)
        print("saved image : " + saveName)
    except Exception:
        print("FAIL to save image : " + imgurl)
        traceback.print_exc()
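

# Example of the resulting file name (illustrative URL, not from the original gist):
#   save_image("https://example.com/img/photo.jpg") writes
#   "https___example_com_img_photo_jpg.jpg" into a.save_dir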
def geturl(targetUrl):
    """
    Parse the <a> tags, save the images on the page, then follow each link.
    :param targetUrl:
    :return:
    """
    try:
        targetUrl = replaceJapaneseStr(targetUrl)
        print("START parse url : " + targetUrl)
        domain = _parse_domain(targetUrl)
        # Fetch the body of the target page
        response = urllib.request.urlopen(targetUrl)
        body = response.read()
        soup = BeautifulSoup(body, "lxml")
        # parse <a> tags
        links = []
        for x in soup.find_all('a'):
            try:
                links.append(x["href"])
            except KeyError:
                pass
        # keep only links on the same domain that have not been visited yet
        # (relative links have no scheme/netloc, so they are skipped here)
        links = list(filter(lambda x: _parse_domain(x) == domain and x not in finishedLinks, links))
        finishedLinks.extend(links)
        # parse <img> tags
        imgUrls = []
        for x in soup.find_all('img'):
            try:
                imgUrls.append(x["src"])
            except KeyError:
                pass
        # keep only images that have not been downloaded yet
        imgUrls = list(filter(lambda x: x not in finishedImgs, imgUrls))
        finishedImgs.extend(imgUrls)
        # save images -------------------------------------------------------
        p = Pool(max_imgProcess_perUrl)
        output = p.map(save_image, imgUrls)
        p.close()  # stop accepting new work
        p.join()   # wait until the worker processes finish
        # go to the next urls -------------------------------------------------
        for l in links:
            # tasks.put(Task(l))  # queue-based alternative (currently unused)
            geturl(l)
            time.sleep(2)
        print("FINISH to parse url : " + targetUrl)
    except Exception:
        print("FAIL to parse url : " + targetUrl)
        traceback.print_exc()
def _parse_domain(targetUrl):
    parsed_uri = urlparse(targetUrl)
    return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
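

# e.g. _parse_domain("https://example.com/page/1?x=2") -> "https://example.com/"
#      (illustrative URL, not from the original gist)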
def replaceJapaneseStr(japaneseUrl):
    # Percent-encode the query string so that URLs containing Japanese
    # (non-ASCII) characters can be opened by urllib
    p = urlparse(japaneseUrl)
    query = urllib.parse.quote_plus(p.query, safe='=&')
    japaneseUrl = '{}://{}{}{}{}{}{}{}{}'.format(
        p.scheme, p.netloc, p.path,
        ';' if p.params else '', p.params,
        '?' if p.query else '', query,
        '#' if p.fragment else '', p.fragment)
    return japaneseUrl
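

# e.g. replaceJapaneseStr("https://example.com/search?q=画像")
#      -> "https://example.com/search?q=%E7%94%BB%E5%83%8F"
#      (illustrative URL, not from the original gist)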
def main():
    if not os.path.exists(a.save_dir):
        os.mkdir(a.save_dir)
    for url in a.target_urls:
        geturl(url)
if __name__ == "__main__":
    main()
    # Send a poison pill to each consumer so the worker processes exit
    for _ in range(max_urlProcess):
        tasks.put(None)
    # Wait for all of the tasks to finish
    tasks.join()