Last active
October 13, 2017 09:34
-
-
Save takahub1/4c2985e389057d0de259c29a03f8e859 to your computer and use it in GitHub Desktop.
Usage: python3 script.py "url" "filename" — pass the page URL and the output filename as command-line arguments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
import os | |
import sys | |
import bs4 | |
import urllib.request | |
def crawring(url, filename):
    """Fetch the HTML page at *url*, collect every link inside the
    ``div`` with ``id="B"``, and write the corresponding absolute
    ``new_pc_view`` URLs to *filename* (one per line, CRLF-terminated),
    echoing each URL to stdout as it is written.

    Exits the process with status 1 when the page could not be
    retrieved or the expected content ``div`` is missing.
    """
    html = get_html_string(url)
    if not html:
        # User-facing messages kept in the original Japanese:
        # "Could not retrieve the HTML." / "Please check the URL."
        print("HTMLが取得できませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    soup = bs4.BeautifulSoup(html, "lxml")
    main_body = soup.find("div", {"id": "B"})
    if main_body is None:
        # Without this guard, find_all on None raises AttributeError.
        print("Could not find the content div (id=\"B\") in the page.")
        sys.exit(1)

    # newline="" keeps the explicit "\r\n" from being translated to
    # "\r\r\n" on Windows; utf-8 avoids locale-dependent encoding.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        for a_tag in main_body.find_all("a"):
            href = a_tag.get("href")
            if href is None:
                continue
            # href starts with a char we drop (presumably "."); the rest
            # is appended to the viewer base URL — TODO confirm format.
            full_url = "http://mangamura.org/new_pc_view" + href[1:]
            print(full_url)
            f.write(full_url + "\r\n")
def get_html_string(url):
    """Download *url* and return the response body decoded as UTF-8.

    Sends a desktop-Firefox ``User-Agent`` header so the server serves
    the regular page instead of blocking the default urllib agent.

    Raises ``urllib.error.URLError``/``HTTPError`` on network failure
    and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the underlying socket even when read()
    # or decode() raises — the original leaked the response object.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def main():
    """CLI entry point: ``python3 script.py <url> <output-filename>``."""
    # Fail with a usage message instead of an IndexError traceback
    # when arguments are missing.
    if len(sys.argv) < 3:
        print("Usage: python3 {} <url> <filename>".format(sys.argv[0]))
        sys.exit(1)
    crawring(sys.argv[1], sys.argv[2])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment