@c3pmark
Created September 11, 2018 13:34
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.handlers.http11 import ScrapyAgent, reactor


# This overrides the default Scrapy agent, which discards partial responses when we
# hit the timeout, and forces it to instead return what we got back.
class TimeoutProcessingAgent(ScrapyAgent):

    def download_request(self, request):
        d = super().download_request(request)
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        # Replace the default timeout callback, which would just cancel our request,
        # with one that drops the connection and allows us to process partial data.
        self._timeout_cl.cancel()
        self._timeout_cl = reactor.callLater(timeout, self._cb_timedout)
        return d

    def _cb_timedout(self):
        # Timeout fired: stop reading and drop the connection, so the body
        # received so far is finalized and handed back instead of discarded.
        self._txresponse._transport.stopProducing()
        self._txresponse._transport._producer.loseConnection()

    def _cb_timeout(self, result, request, url, timeout):
        # Needed for HTTPS requests, otherwise _ResponseReader doesn't
        # receive connectionLost()
        if self._txresponse and self._txresponse._transport:
            self._txresponse._transport.stopProducing()
        return result


class TimeoutProcessingDownloadHandler(HTTPDownloadHandler):

    def download_request(self, request, spider):
        agent = TimeoutProcessingAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
        )
        return agent.download_request(request)
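

To wire this in, point Scrapy's DOWNLOAD_HANDLERS setting at the custom handler. A minimal sketch follows, assuming the code above is saved as myproject/handlers.py (a hypothetical module path). Since the handler cuts the connection on purpose, you will likely also want DOWNLOAD_FAIL_ON_DATALOSS = False so Scrapy delivers the truncated body to the spider (flagged with 'dataloss' in response.flags) instead of failing the request:

# settings.py -- 'myproject.handlers' is a hypothetical module path for
# wherever the handler above is saved.
DOWNLOAD_HANDLERS = {
    'http': 'myproject.handlers.TimeoutProcessingDownloadHandler',
    'https': 'myproject.handlers.TimeoutProcessingDownloadHandler',
}

# Hand partial responses to the spider rather than treating the deliberate
# connection drop as an error.
DOWNLOAD_FAIL_ON_DATALOSS = False

Per-request timeouts still come from the standard download_timeout meta key, e.g. Request(url, meta={'download_timeout': 5}), which download_request above reads before arming its replacement callback.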
@pengisgood

It works for me. Thanks.
