Created
December 11, 2020 11:23
-
-
Save eupendra/e7f72904d9a561adb6e44fae16af64ec to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import scrapy | |
from scrapy.utils.response import open_in_browser | |
def get_headers(s, sep=': ', strip_cookie=True, strip_cl=True, strip_headers: "list | None" = None) -> dict:
    """Parse a block of raw ``name<sep>value`` header lines into a dict.

    Args:
        s: Multi-line text, one header per line (for example, headers
            copied from a browser's network inspector).
        sep: Separator between a header name and its value.
        strip_cookie: When true, drop the ``cookie`` header
            (name compared case-insensitively).
        strip_cl: When true, drop the ``content-length`` header
            (name compared case-insensitively).
        strip_headers: Extra header names to drop; compared exactly
            (case-sensitively) against the name as written in ``s``.

    Returns:
        A dict mapping header names to values, in input order. If a name
        occurs more than once, the last occurrence wins.

    Lines that are blank or do not contain ``sep`` are ignored. Each line is
    stripped before the check, so with the default separator a header with
    an empty value (``"name:"``) is skipped as well.
    """
    excluded = strip_headers if strip_headers is not None else ()
    headers = {}
    for line in s.split('\n'):
        line = line.strip()
        if not line or sep not in line:
            continue
        # Split on the FIRST separator only, so values that themselves
        # contain the separator are kept whole instead of being truncated.
        name, _, value = line.partition(sep)
        # A literal pair of single quotes denotes an empty value.
        if value == "''":
            value = ''
        lowered = name.lower()
        if strip_cookie and lowered == 'cookie':
            continue
        if strip_cl and lowered == 'content-length':
            continue
        if name in excluded:
            continue
        headers[name] = value
    return headers
class WsSpider(scrapy.Spider):
    """Spider that requests one wine-searcher.com page with browser-like headers."""

    name = 'ws'

    def start_requests(self):
        """Yield a single request for the Canadian producers listing page."""
        # Raw headers as copied from a browser session. get_headers() is
        # called with its defaults, so the ``cookie`` line below is dropped
        # and is not sent with the request.
        # NOTE(review): the cookie line embeds what look like session
        # identifiers; consider removing it from source control.
        raw_headers = '''
accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9
accept-encoding: gzip, deflate, br
accept-language: en-US,en;q=0.9
sec-fetch-dest: document
sec-fetch-mode: navigate
sec-fetch-site: none
upgrade-insecure-requests: 1
user-agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36
cookie: cookie_enabled=true; ID=MJHDC7TT2D70BWF; IDPWD=I55722139; COOKIE_ID=MJHDC7TT2D70BWF; visit=MJHDC7TT2D70BWF%7C20201211094710%7C%2Fbiz%2Fproducers%2Fcanada%7C%7Cend%20; user_status=A%7C; fflag=flag_store_manager%3A0%2Cend; _csrf=9iauzg0wH9qtx0aJNWZfufQyKrOGIxPo; _pxhd=996f3f5938fe57b8b25d1537c6cd623b8244d92d58ffa16bc4bcc83076669d8c:d683a651-3b95-11eb-9210-31942c6b15e0; _pxvid=d683a651-3b95-11eb-9210-31942c6b15e0; _gid=GA1.2.1112469370.1607680033; _ga_M0W3BEYMXL=GS1.1.1607680032.1.1.1607680234.0; _ga=GA1.2.1736177304.1607680033; __gads=ID=03e4454d1a5b70bb:T=1607680036:S=ALNI_MbKY38CaYSCQKSCLAGi_bDFdqg_FQ; _px3=ff767f5546c27c337924d2df0becd8fdfacf7a0daf4b3a3c711597f908d693b6:Kykj5xcibzza3GO1XxKPXIRm2kpApik8o2Rbao62N1hkOI9p1zeqNHVuWq+dbkGTzZk4B9JKS0iE32PEedrqMA==:1000:nJXdHhIDcTcyI0rMJsQi87T+ABu9tKZiCOVSrHzKQodKjvMo6jf9MkpN06BHLqX4xdvRM/Fe5waFUHzxPOZOBeN/pLWwBXnVWt69BLIq7bjj0RLeTeCCLsky4lxOzxgrSZz+zd/FALzdyrTvennzpAQTDVQBeAwzMLKqJK8x9p0=; _px2=eyJ1IjoiNTBhMzNhZTAtM2I5Ni0xMWViLTlkZmMtNGYyNTM0MDhlYzU1IiwidiI6ImQ2ODNhNjUxLTNiOTUtMTFlYi05MjEwLTMxOTQyYzZiMTVlMCIsInQiOjE2MDc2ODEyNTk1NDcsImgiOiIwOGViY2U1OWIwOWM0MzA2MjQyNzRiZTQ1NzA2NWJmZTdkNTdkODMyOWQ3ZDYxNDA4OGMyYTFlNzRkNTBjZDk1In0=; _pxde=5eef4b6944a88ff4b800d3fc7d412d77240d66691e17ac42b7b9026dc4a4ab0e:eyJ0aW1lc3RhbXAiOjE2MDc2ODA5NTk1NDcsImZfa2IiOjAsImlwY19pZCI6W119
'''
        request_headers = get_headers(raw_headers)
        target_url = 'https://www.wine-searcher.com/biz/producers/canada'
        yield scrapy.Request(target_url, headers=request_headers)

    def parse(self, response):
        """Pass the downloaded response to Scrapy's ``open_in_browser`` helper."""
        open_in_browser(response)
Hi;
I got the answer to my original question by watching another video of yours: One just needs to add the process statements to run the code.
Thanks again for posting your videos.
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Hi; I'm new to Scrapy, but not to web scraping (VBA, Python, Beautiful Soup, etc.). I tried running your code in VS Code on Windows. I did not get any results or error messages. In debug mode, the code never enters the functions in the WsSpider class. The only variable that gets set is name = "ws". Any help would be greatly appreciated.