Last active
October 13, 2017 09:34
-
-
Save takahub1/4c2985e389057d0de259c29a03f8e859 to your computer and use it in GitHub Desktop.
Usage: python3 script.py "url" "filename" — pass the page URL and the output filename as command-line arguments.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#-*- coding:utf-8 -*- | |
import os | |
import sys | |
import bs4 | |
import urllib.request | |
def crawring(url, filename):
    """Fetch the HTML page at *url*, collect every link inside the
    ``div`` with ``id="B"``, and write the corresponding absolute
    ``new_pc_view`` URLs to *filename* (one per line, CRLF-terminated),
    echoing each URL to stdout as it is written.

    Exits the process with status 1 when the page could not be
    retrieved or the expected content ``div`` is missing.
    """
    html = get_html_string(url)
    if not html:
        # User-facing messages kept in the original Japanese:
        # "Could not retrieve the HTML." / "Please check the URL."
        print("HTMLが取得できませんでした。")
        print("URLを確認してください。")
        sys.exit(1)

    soup = bs4.BeautifulSoup(html, "lxml")
    main_body = soup.find("div", {"id": "B"})
    if main_body is None:
        # Without this guard, find_all on None raises AttributeError.
        print("Could not find the content div (id=\"B\") in the page.")
        sys.exit(1)

    # newline="" keeps the explicit "\r\n" from being translated to
    # "\r\r\n" on Windows; utf-8 avoids locale-dependent encoding.
    with open(filename, "w", encoding="utf-8", newline="") as f:
        for a_tag in main_body.find_all("a"):
            href = a_tag.get("href")
            if href is None:
                continue
            # href starts with a char we drop (presumably "."); the rest
            # is appended to the viewer base URL — TODO confirm format.
            full_url = "http://mangamura.org/new_pc_view" + href[1:]
            print(full_url)
            f.write(full_url + "\r\n")
def get_html_string(url):
    """Download *url* and return the response body decoded as UTF-8.

    Sends a desktop-Firefox ``User-Agent`` header so the server serves
    the regular page instead of blocking the default urllib agent.

    Raises ``urllib.error.URLError``/``HTTPError`` on network failure
    and ``UnicodeDecodeError`` if the body is not valid UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:47.0) Gecko/20100101 Firefox/47.0",
    }
    request = urllib.request.Request(url=url, headers=headers)
    # Context manager closes the underlying socket even when read()
    # or decode() raises — the original leaked the response object.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
def main():
    """CLI entry point: ``python3 script.py <url> <output-filename>``."""
    # Fail with a usage message instead of an IndexError traceback
    # when arguments are missing.
    if len(sys.argv) < 3:
        print("Usage: python3 {} <url> <filename>".format(sys.argv[0]))
        sys.exit(1)
    crawring(sys.argv[1], sys.argv[2])


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment