@KielD-01
Created August 9, 2018 13:35
Parser
import os
import re
import multiprocessing

import requests
from bs4 import BeautifulSoup  # the 'lxml' package must also be installed; it is used below as the parser backend


def transform_cache_url(url):
    """Turn a URL into a filesystem-safe cache key."""
    return re.sub(r"[^0-9a-zA-Z]", '_', url)


def transform_page_urls(text, url):
    """Strip the given base URL from text so cached pages keep relative links."""
    return text.replace(url, '')
class Trendsgal:
    base_uri = 'https://www.trendsgal.com'
    base_uri_to_replace = '//www.trendsgal.com'
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) ' \
                 'Chrome/68.0.3440.84 Safari/537.36'
    selectors = {
        'menu_items': 'li[gao="bowen"] dl dd a'
    }
    cache_dir = './cached/'

    def __init__(self):
        # Mutable state is created per instance instead of as shared class attributes.
        self.categories = []
        self.domParser = None
        self.request_headers = {'User-Agent': self.user_agent}
        self.requests_session = requests.Session()
        self.requests_session.headers.update(self.request_headers)
        if not os.path.exists(self.cache_dir):
            os.makedirs(self.cache_dir, mode=0o755, exist_ok=True)
        self.run()
    def run(self):
        self.get_categories()

    def _get_page(self, url=None, prefix=None):
        """Fetch a page, serving it from the on-disk cache when available."""
        url = self.base_uri + url
        # The cache helpers sanitise the URL themselves, so the raw URL is passed
        # through (the original sanitised it twice, harmlessly but redundantly).
        cached = self._get_cached_page(url, prefix)
        if cached['status']:
            return cached['data']
        response = self.requests_session.get(url)
        dom_parser = BeautifulSoup(response.text, 'lxml')
        self._set_cached_page(url, prefix, dom_parser)
        return dom_parser
    def _get_cached_page(self, url, prefix=None):
        if prefix is None:
            prefix = 'cache'
        cached_path = self.cache_dir + '%s_%s.html' % (prefix, transform_cache_url(url))
        status = os.path.isfile(cached_path)
        data = None
        if status:
            # The with-statement closes the file handle the original left open.
            with open(cached_path, 'r', encoding='utf-8') as handle:
                data = BeautifulSoup(handle.read(), 'lxml')
        return {
            'status': status,
            'data': data
        }
    def _set_cached_page(self, url, prefix=None, content=None):
        if prefix is None:
            prefix = 'cache'
        cached_path = self.cache_dir + '%s_%s.html' % (prefix, transform_cache_url(url))
        with open(cached_path, 'w', encoding='utf-8') as handle:
            # str(content) serialises the soup back to markup; the original's
            # str(content.encode('utf-8')) would write a b'...' literal instead of HTML.
            handle.write(
                transform_page_urls(str(content), self.base_uri_to_replace)
            )
    def get_categories(self):
        self.domParser = self._get_page('/')
        menu_items = self.domParser.select(self.selectors['menu_items'])
        # Indexes start at 0 so that self.categories[index] in the worker matches
        # list positions (the original started at 1, an off-by-one against the list).
        for category_index, menu_item in enumerate(menu_items):
            category = {
                'index': category_index,
                'title': menu_item.text.replace('\\', '').strip(),
                'link': transform_page_urls(menu_item.attrs['href'], self.base_uri_to_replace),
                'products': []
            }
            self.categories.append(category)
        self.proceed_categories()
    def proceed_categories(self):
        workers_jobs = []
        for work in self.categories:
            # target is a bound method, so self must not be repeated in args
            # (the original passed it twice); the trailing comma makes args a tuple.
            work_process = multiprocessing.Process(
                target=self.proceed_category_products, args=(work.get('index'),)
            )
            workers_jobs.append(work_process)
            work_process.start()
    def proceed_category_products(self, index):
        category = self.categories[index]
        # 'total' stays a placeholder; walking every page would require reading the
        # real page count from the pagination block of the first fetched page.
        pagination = {
            'total': 1,
            'current': 1
        }
        while pagination['current'] <= pagination['total']:
            page = ''
            if pagination['current'] > 1:
                page = 'p_%d' % pagination['current']
            # %s rather than %r, which would quote the values and corrupt the URL.
            # Fetching the stored category link is an assumption about intent: the
            # original requested '/%r' % page, which dropped the category entirely.
            self._get_page(
                '%s%s' % (category.get('link'), page),
                'category_%s_products_%s' % (category.get('title'), page)
            )
            pagination['current'] += 1
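

# A minimal entry point, sketched here as an assumption rather than part of the
# original gist: the class launches multiprocessing workers from __init__, so
# instantiation belongs under a __main__ guard. With the 'spawn' start method
# (the default on Windows and macOS), child processes re-import this module,
# and without the guard each child would build its own Trendsgal instance and
# recursively spawn more workers.
if __name__ == '__main__':
    Trendsgal()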