Last active
October 13, 2017 09:34
-
-
Save takahub1/7006803dfd1a3641a10b440127be75d9 to your computer and use it in GitHub Desktop.
python3 getImage.py "filename"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import os
import shutil
import subprocess
import sys
import time
import urllib.error
import urllib.request

import bs4
def _safe_dir_name(title):
    """Turn a page title into a single, non-empty directory name.

    Whitespace and path separators become underscores so the result always
    names one directory directly under the current working directory.
    Empty titles and the special names ``.`` / ``..`` fall back to
    ``"untitled"`` so the later recursive delete can never leave that
    directory.
    """
    name = (title or "").strip()
    for ch in (" ", "/", "\\", "\n", "\r", "\t"):
        name = name.replace(ch, "_")
    if name in ("", ".", ".."):
        return "untitled"
    return name


def crawring(url):
    """Download the background images referenced by the page at *url*.

    The page is fetched with ``get_html_string`` and parsed with
    BeautifulSoup.  Every ``<div>`` that carries a ``data-background-image``
    attribute is downloaded with ``wget`` into a directory named after the
    page title; the directory is then packed into ``<title>.zip`` and
    removed.

    Args:
        url: Address of the page to scrape.

    Raises:
        SystemExit: With status 1 when the page cannot be fetched or is
            empty.
    """
    # Fetch the HTML of the given URL.  A failed request is treated like an
    # empty page so the user-facing message below is actually reachable.
    try:
        html = get_html_string(url)
    except (urllib.error.URLError, ValueError):
        html = ""
    if not html:
        print("HTMLが取得できませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    soup = bs4.BeautifulSoup(html, "lxml")
    title_tag = soup.title
    book_title = _safe_dir_name(
        title_tag.string if title_tag is not None else None
    )
    os.makedirs(book_title, exist_ok=True)

    for div_tag in soup.find_all("div"):
        image_url = div_tag.get("data-background-image")
        if not image_url:
            continue
        print(image_url)
        # Argument list instead of a shell string: the title and the image
        # URL come from the remote page and must never reach a shell.
        subprocess.run(
            ["wget", "-q", "-P", os.path.join(".", book_title), "--", image_url],
            check=False,
        )

    # make_archive always writes "<title>.zip", even when the title itself
    # contains a dot, so the archive can never collide with the directory.
    shutil.make_archive(book_title, "zip", root_dir=".", base_dir=book_title)
    shutil.rmtree(book_title)
def get_html_string(url):
    """Fetch *url* and return the response body as text.

    The request is sent with a desktop Firefox ``User-Agent`` header.  The
    body is decoded with the charset declared in the response's
    ``Content-Type`` header, falling back to UTF-8 when none is declared.

    Args:
        url: Address to fetch.

    Returns:
        The decoded response body.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # The context manager closes the connection even if read/decode raises.
    with urllib.request.urlopen(request) as response:
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset)
def main():
    """Crawl every URL listed in the file named by the first CLI argument.

    The file is expected to hold one URL per line.  Surrounding whitespace
    (including the line terminator) is stripped from each line and blank
    lines are skipped before the URL is handed to ``crawring``.

    Raises:
        SystemExit: With status 1 when the script is not given exactly one
            argument.
    """
    # Check the arguments.
    if len(sys.argv) != 2:
        print("usage: python3 getImage.py <url-list-file>", file=sys.stderr)
        sys.exit(1)

    with open(sys.argv[1]) as url_file:
        # Strip the trailing newline: a URL containing "\n" is not a valid
        # request target.
        urls = [line.strip() for line in url_file]

    for url in urls:
        if not url:
            continue
        crawring(url)
# Run the crawler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment