@LenKIM
Created June 11, 2019 01:12
crawling_test.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import re
from random import choice
import os.path
import requests
from lxml import etree
from bs4 import BeautifulSoup
import time
import util
mainUrl = "http://www.audikoreaevent.co.kr/etc/component/index1.jsp"
response_ok = 200
class Audi:
    # Pool of desktop user-agent strings; one is chosen at random per request.
    desktop_agents = [
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.98 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36',
        'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:50.0) Gecko/20100101 Firefox/50.0',
    ]
    @staticmethod
    def url_to_object(url):
        # Fetch the search page and parse it into an lxml element tree.
        main_page = requests.post(url)
        root = etree.HTML(main_page.content)[1]  # index 1 is typically the <body> element
        return root
    @staticmethod
    def get_model_list(root):
        # Collect every model name offered in the "sModel" <select> box.
        result = []
        options = root.xpath("//select[@name='sModel']//option")
        for element in options:
            if re.match(r'^(\w)', element.text):
                result.append(element.text.replace(" ", "+"))  # spaces become '+' for the query string
        return result
    @staticmethod
    def get_year_list(root):
        # Collect every model year offered in the "sYear" <select> box.
        result = []
        options = root.xpath("//select[@name='sYear']//option")
        for element in options:
            if re.match(r'^(\w)', element.text):
                result.append(element.text)
        return result
    @staticmethod
    def random_headers(desktop_agents):
        # Build request headers with a randomly chosen desktop user agent.
        return {'User-Agent': choice(desktop_agents),
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'}
    def create_search_info(self, modelList, yearList):
        # For each model, crawl every model year page by page and append the
        # parts rows to ./result/audi/audi_model_<model>.csv.
        for model in modelList:
            csv_path = "./result/audi/audi_model_" + str(model) + ".csv"
            if os.path.exists(csv_path):
                print('exists')
                continue
            f = open(csv_path, 'w', encoding='utf-8')
            for year in yearList:
                should_stop = False
                page = 1
                double_retry_connection = 1
                while True:
                    if should_stop:
                        break
                    try:
                        url = (mainUrl + "?pageNo=" + str(page) + "&sModel=" + str(model)
                               + "&sYear=" + str(year) + "&sGroup=&sKey=&sValue=")
                        s = util.requests_retry_session()
                        response = s.request('get', url,
                                             headers=self.random_headers(self.desktop_agents),
                                             timeout=10)
                        if response.status_code == response_ok:
                            print('Ok')
                            double_retry_connection = 0
                            main = BeautifulSoup(response.content.decode('utf-8', 'replace'), 'html.parser')
                            tables = main.findAll('table', {'summary': ''})
                            trs = tables[1]('tr')  # the second table holds the search results
                            for tr in trs:
                                td = tr('td')
                                if len(td) <= 0:
                                    continue
                                if '검색된' in td[0].text:  # result-count / "no results" row: stop paging
                                    should_stop = True
                                    break
                                name = util.defaultString(td[0].text.strip())            # model name
                                group = util.defaultString(td[1].text.strip())           # part group
                                part_no = util.defaultString(td[2].text.strip())         # part number
                                part_name_eng = util.defaultString(td[3].text.strip())   # part name (English)
                                part_name_kor = util.defaultString(td[4].text.strip())   # part name (Korean)
                                part_name_info = util.defaultString(td[5].text.strip())  # detail info
                                price = util.defaultString(td[6].text.strip())           # price
                                date = util.defaultString(td[7].text.strip())            # effective date
                                data = util.seperator.join(
                                    [name, group, part_no, part_name_eng, part_name_kor,
                                     part_name_info, price, date, '\n'])
                                print(data)
                                f.write(data)
                    except requests.exceptions.Timeout as timeout:
                        print('Timeout = ' + str(timeout))
                        continue  # retry the same page
                    except Exception as e:
                        print('Audi : ' + str(e))
                        double_retry_connection += 1
                        if double_retry_connection >= util.MAX_RETRY_COUNT:
                            time.sleep(60 * double_retry_connection)
                            f.close()
                            exit()
                        continue
                    page += 1
            f.close()
            # Column order: model name | part group | part number | part name (English)
            # | part name (Korean) | detail info | price | effective date
# Parse the search form once to discover the available models and years.
root = Audi.url_to_object(mainUrl)
modelList = Audi.get_model_list(root)
yearList = Audi.get_year_list(root)
modelList.pop(0)  # drop the first (placeholder) option of each <select>
yearList.pop(0)

os.makedirs("./result/audi", exist_ok=True)  # make sure the output directory exists

try:
    Audi().create_search_info(modelList, yearList)
except Exception as e:
    print(str(e))
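
The script also imports a companion util module that is not included in this gist. Based on how it is used above (requests_retry_session, defaultString, seperator, MAX_RETRY_COUNT), the following is a minimal sketch of what such a helper might contain; the implementations and default values are assumptions, not the original author's code.

util.py (hypothetical sketch)

# util.py - assumed helper module for crawling_test.py (not the original)
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

MAX_RETRY_COUNT = 5   # assumed: give up after this many consecutive failures
seperator = '|'       # assumed: field separator for the CSV-like output rows


def requests_retry_session(retries=3, backoff_factor=0.3,
                           status_forcelist=(500, 502, 504), session=None):
    """Return a requests.Session that retries transient HTTP errors."""
    session = session or requests.Session()
    retry = Retry(total=retries, read=retries, connect=retries,
                  backoff_factor=backoff_factor,
                  status_forcelist=status_forcelist)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def defaultString(value):
    """Return the stripped string, or a placeholder when the cell is empty."""
    return value.strip() if value and value.strip() else '-'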