Skip to content

Instantly share code, notes, and snippets.

@vural
Created March 12, 2018 19:52
Show Gist options
  • Save vural/3c5065cf0e20871501b0bada2967f19a to your computer and use it in GitHub Desktop.
Save vural/3c5065cf0e20871501b0bada2967f19a to your computer and use it in GitHub Desktop.
megep
import requests
from bs4 import BeautifulSoup
class Parser(object):
    """Crawl the module listing of megep.meb.gov.tr and yield document links.

    Constructing a ``Parser`` downloads the module page once and stores the
    values of its ``__VIEWSTATE`` and ``__VIEWSTATEGENERATOR`` elements; both
    are sent back with every form post made by :meth:`parse_document`.

    Every HTTP call uses ``TIMEOUT`` and raises ``requests.HTTPError`` on a
    4xx/5xx response instead of trying to parse an error page.
    """

    __slots__ = ('_view_state', '_view_state_generator')

    # Endpoints used by the crawler, kept in one place.
    BASE_URL = 'http://megep.meb.gov.tr/'
    MODULES_URL = 'http://megep.meb.gov.tr/?page=moduller'
    JSON_URL = 'http://megep.meb.gov.tr/JSON.aspx'

    # Seconds to wait for the server; requests has no timeout by default and
    # would otherwise block forever on a stalled connection.
    TIMEOUT = 30

    def __init__(self):
        """Fetch the module page and capture its view-state form fields.

        Raises:
            requests.RequestException: if the page cannot be downloaded.
            ValueError: if either view-state element is missing from the page.
        """
        response = requests.get(self.MODULES_URL, timeout=self.TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        self._view_state = self._hidden_field_value(soup, '__VIEWSTATE')
        self._view_state_generator = self._hidden_field_value(
            soup, '__VIEWSTATEGENERATOR')

    @staticmethod
    def _hidden_field_value(soup, field_id):
        """Return the ``value`` attribute of the element with id *field_id*.

        Raises:
            ValueError: if no element with that id exists in *soup*.
        """
        field = soup.find(id=field_id)
        if field is None:
            raise ValueError(
                'element {!r} not found in the module page'.format(field_id))
        return field.get('value')

    def _fetch_head(self, tip, id_value):
        """GET JSON.aspx with the given ``tip``/``id`` pair; return its 'Head'."""
        query = (
            ('tip', tip),
            ('id', id_value),
        )
        response = requests.get(
            self.JSON_URL, params=query, timeout=self.TIMEOUT)
        response.raise_for_status()
        return response.json()['Head']

    def parse_category(self):
        """Return the 'Head' entry of the ``modulAlanDoldur`` JSON response.

        :meth:`parse` iterates the result and reads ``'ID'`` and ``'alanAd'``
        from each entry.
        """
        return self._fetch_head('modulAlanDoldur', '0')

    def parse_item(self, id):
        """Return the 'Head' entry of the ``alanChange`` JSON response for *id*.

        :meth:`parse` iterates the result and reads ``'ID'`` and ``'dersAd'``
        from each entry.

        NOTE(review): the parameter shadows the builtin ``id``; the name is
        kept because it is part of the public signature.
        """
        return self._fetch_head('alanChange', id)

    def parse_document(self, category_id, sub_category_id):
        """Yield one URL for every link in the posted form's response.

        The module form is posted with *category_id* and *sub_category_id*
        filled in, together with the stored view-state values. Each anchor's
        ``href`` in the response is appended to the site root. Anchors that
        have no ``href`` attribute are skipped.

        This is a generator, so the request is only sent once iteration
        starts.

        Raises:
            requests.RequestException: if the post fails or returns 4xx/5xx.
        """
        # Fixed request headers; 'X-MicrosoftAjax' and '__ASYNCPOST' below
        # presumably mark the post as a partial page update -- TODO confirm.
        headers = {
            'Origin': 'http://megep.meb.gov.tr',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'tr,en-US;q=0.9,en;q=0.8',
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.146 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Accept': '*/*',
            'Cache-Control': 'no-cache',
            'X-Requested-With': 'XMLHttpRequest',
            'Connection': 'keep-alive',
            'X-MicrosoftAjax': 'Delta=true',
            'Referer': 'http://megep.meb.gov.tr/?page=moduller',
        }
        # A list of pairs (not a dict) so the fields are sent in this order.
        data = [
            ('ScriptManager1', 'ScriptManager1|ctl10$drpDersListe'),
            ('__EVENTTARGET', 'ctl10$drpDersListe'),
            ('__EVENTARGUMENT', ''),
            ('__LASTFOCUS', ''),
            ('__VIEWSTATE', self._view_state),
            ('__VIEWSTATEGENERATOR', self._view_state_generator),
            ('ctl10$drpModulAlanListe', category_id),
            ('ctl10$hdnAlanId', category_id),
            ('ctl10$drpDersListe', sub_category_id),
            ('ctl10$hdnDersId', sub_category_id),
            ('ctl10$txtModulAd', ''),
            ('ctl10$txtModulKod', ''),
            ('ctl10$drpModulAlanListe_2015', ''),
            ('ctl10$hdnAlanId_2015', ''),
            ('ctl10$drpDersListe_2015', ''),
            ('ctl10$hdnDersId_2015', '0'),
            ('__ASYNCPOST', 'true'),
            ('', ''),
        ]
        response = requests.post(
            self.BASE_URL,
            headers=headers,
            params={'page': 'moduller'},
            data=data,
            timeout=self.TIMEOUT,
        )
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        for link in soup.find_all('a'):
            href = link.get('href')
            if href is None:
                # No target: formatting it would yield a bogus '.../None' URL.
                continue
            yield 'http://megep.meb.gov.tr/{}'.format(href)

    def parse(self):
        """Walk categories, then their items, then each item's documents.

        Yields:
            dict: ``main_category_name`` (the category's ``'alanAd'``),
            ``sub_category_name`` (the item's ``'dersAd'``) and ``document``
            (a URL produced by :meth:`parse_document`).
        """
        for category in self.parse_category():
            category_id, category_name = category['ID'], category['alanAd']
            for item in self.parse_item(category_id):
                item_id, item_name = item['ID'], item['dersAd']
                for document in self.parse_document(category_id, item_id):
                    yield {
                        'main_category_name': category_name,
                        'sub_category_name': item_name,
                        'document': document,
                    }
if __name__ == '__main__':
    # Run the crawl only when executed as a script, so that importing this
    # module does not trigger network requests as a side effect.
    parser = Parser()
    for record in parser.parse():
        print(record)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment