Created
June 11, 2019 01:12
-
-
Save LenKIM/76b9c4dd95b2751f05a77b539f0412ac to your computer and use it in GitHub Desktop.
crawling_test.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
# !/usr/bin/env python3 | |
import re | |
from random import choice | |
import os.path | |
import requests | |
from lxml import etree | |
from bs4 import BeautifulSoup | |
import time | |
import util | |
# Entry page of the Audi Korea parts lookup; search parameters are appended
# to this URL as a query string by the crawler.
mainUrl = "http://www.audikoreaevent.co.kr/etc/component/index1.jsp"
# HTTP status code that is treated as a successful response.
response_ok = 200
class Audi:
    """Crawler for the parts-price search page addressed by ``mainUrl``.

    The class is used mostly as a namespace: ``url_to_object``,
    ``get_model_list`` and ``get_year_list`` do not operate on an ``Audi``
    instance (see their docstrings), while ``create_search_info`` must be
    called on a real instance because it reads ``self.desktop_agents``.
    """

    # Pool of desktop browser User-Agent strings; one is chosen at random
    # for every search request.
    desktop_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
    ]

    def url_to_object(self):
        """POST to a URL and return the second child of the parsed HTML root.

        NOTE: despite its name, ``self`` is the URL string, not an ``Audi``
        instance; call this as ``Audi.url_to_object(url)``. The parameter
        name is kept so existing positional callers keep working.

        Returns:
            The element at index 1 of the ``lxml`` HTML root (normally the
            ``<body>`` element -- TODO confirm for this site).

        Raises:
            requests.exceptions.RequestException: on connection problems or
                when the server does not answer within 10 seconds.
        """
        # A timeout keeps a dead server from hanging the script forever.
        main_page = requests.post(self, timeout=10)
        return etree.HTML(main_page.content)[1]

    def get_model_list(self):
        """Return the model names offered by the ``sModel`` drop-down.

        NOTE: ``self`` is the parsed document (anything exposing
        ``xpath``), not an ``Audi`` instance.

        Only options whose text starts with a word character are kept;
        options without text are skipped. Spaces are replaced by ``+`` so
        the name can be pasted directly into the search query string.
        """
        options = self.xpath("//select[@name='sModel']//option")
        return [
            option.text.replace(" ", "+")
            for option in options
            # ``option.text`` is None for an empty <option> element.
            if option.text and re.match(r'^(\w)', option.text)
        ]

    def get_year_list(self):
        """Return the years offered by the ``sYear`` drop-down.

        NOTE: ``self`` is the parsed document (anything exposing
        ``xpath``), not an ``Audi`` instance.

        Only options whose text starts with a word character are kept;
        options without text are skipped.
        """
        options = self.xpath("//select[@name='sYear']//option")
        return [
            option.text
            for option in options
            if option.text and re.match(r'^(\w)', option.text)
        ]

    @staticmethod
    def random_headers(desktop_agents):
        """Build request headers with a User-Agent picked at random.

        Args:
            desktop_agents: non-empty sequence of User-Agent strings.

        Returns:
            A dict with ``User-Agent`` and ``Accept`` headers.
        """
        return {'User-Agent': choice(desktop_agents),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}

    def create_search_info(self, modelList, yearList):
        """Crawl every (model, year) combination into one CSV file per model.

        For each model a file ``./result/audi/audi_model_<model>.csv`` is
        written. A model whose file already exists is skipped, which lets
        an interrupted crawl be restarted.

        NOTE(review): a run that aborts midway leaves a partial file behind
        that the next run will skip as "exists" -- confirm whether that is
        acceptable.

        Args:
            modelList: model names, already formatted for the query string.
            yearList: year values to search for each model.

        Raises:
            SystemExit: when too many consecutive requests for one page
                fail (see ``_crawl_year``).
        """
        out_dir = "./result/audi"
        for model in modelList:
            path = out_dir + "/audi_model_" + str(model) + ".csv"
            if os.path.exists(path):
                print('exists')
                continue
            # Create the output directory on demand instead of failing on open().
            os.makedirs(out_dir, exist_ok=True)
            # ``with`` guarantees the file is closed on every exit path,
            # including the SystemExit raised on repeated failures.
            with open(path, 'w', encoding='utf-8') as f:
                for year in yearList:
                    self._crawl_year(f, model, year)

    def _crawl_year(self, f, model, year):
        """Fetch all result pages for one model/year and append them to *f*."""
        page = 1
        failures = 0  # consecutive non-timeout failures for the current page
        while True:
            url = (mainUrl + "?pageNo=" + str(page) + "&sModel=" + str(model)
                   + "&sYear=" + str(year) + "&sGroup=&sKey=&sValue=")
            try:
                # Project helper; presumably returns a requests session
                # configured with retries -- TODO confirm.
                s = util.requests_retry_session()
                response = s.request('get', url,
                                     headers=self.random_headers(self.desktop_agents),
                                     timeout=10)
                if response.status_code != response_ok:
                    # Treat an unexpected status as a failure of this page
                    # rather than silently moving on to the next page.
                    raise RuntimeError('unexpected HTTP status '
                                       + str(response.status_code))
                print('Ok')
                rows, should_stop = self._parse_rows(
                    response.content.decode('utf-8', 'replace'))
            except requests.exceptions.Timeout as timeout:
                # Timeouts are retried without counting toward the limit.
                print('Timeout = ' + str(timeout))
                continue
            except Exception as e:
                print('Audi : ' + str(e))
                failures += 1
                if failures >= util.MAX_RETRY_COUNT:
                    # NOTE(review): the wait before exiting is kept from the
                    # original code; its purpose is not visible here.
                    time.sleep(60 * failures)
                    raise SystemExit
                continue

            # Only a fully fetched and parsed page counts as a success, so a
            # page that keeps failing to parse still reaches the retry limit.
            failures = 0
            # Rows are written after the whole page parsed, so a retry of
            # the same page can never duplicate already-written rows.
            for data in rows:
                print(data)
                f.write(data)
            # Stop on the end marker, or when a page carries no data rows at
            # all (otherwise the page counter would grow forever).
            if should_stop or not rows:
                break
            page += 1

    @staticmethod
    def _parse_rows(html):
        """Extract the result rows of one page as ready-to-write CSV lines.

        Returns:
            ``(rows, should_stop)`` where ``rows`` is a list of strings and
            ``should_stop`` is True when a row containing '검색된' was seen
            (presumably the site's "no results found" message).

        Raises:
            IndexError: when the page has fewer than two matching tables or
                a data row has fewer than eight cells.
        """
        main = BeautifulSoup(html, 'html.parser')
        tables = main.findAll('table', {'summary': ''})
        rows = []
        for tr in tables[1]('tr'):
            td = tr('td')
            if not td:
                continue  # row without <td> cells
            if '검색된' in td[0].text:
                return rows, True
            # Column order: model name, part group, part number,
            # part name (English), part name (Korean), details, price,
            # effective date.
            fields = [util.defaultString(td[i].text.strip()) for i in range(8)]
            # The trailing '\n' element ends the record (and, as before,
            # leaves one separator in front of the newline).
            rows.append(util.seperator.join(fields + ['\n']))
        return rows, False
# '모델명' + '|' + '부품 그룹' + '|' + '부품 번호' + '|' + '부품명(영어)' + "|" + '부품명(한글)' + '|' + "세부정보" + "|" + "가격" + "|" + "적용일자" + "\n") | |
# Driver: read the available models and years from the entry page, then
# crawl every combination into per-model CSV files.
# The helpers live on the Audi class, so they are reached through it; the
# first three take the URL / parsed document as their only argument.
root = Audi.url_to_object(mainUrl)
modelList = Audi.get_model_list(root)
yearList = Audi.get_year_list(root)
# Drop the first entry of each list (presumably the drop-down's placeholder
# option -- TODO confirm). ``del x[:1]`` is a no-op on an empty list, unlike
# ``pop(0)``, which would raise IndexError.
del modelList[:1]
del yearList[:1]
try:
    # create_search_info reads self.desktop_agents, so it needs an instance.
    Audi().create_search_info(modelList, yearList)
except Exception as e:
    print(str(e))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment