@c3pmark
Created September 11, 2018 13:34
from scrapy.core.downloader.handlers.http import HTTPDownloadHandler
from scrapy.core.downloader.handlers.http11 import ScrapyAgent, reactor


# This overrides the default Scrapy agent, which discards partial responses when we
# hit the timeout, and forces it to instead return what we got back.
class TimeoutProcessingAgent(ScrapyAgent):

    def download_request(self, request):
        d = super().download_request(request)
        timeout = request.meta.get('download_timeout') or self._connectTimeout
        # Replace the default timeout callback, which would just cancel our request,
        # with one that drops the connection and allows us to process partial data.
        self._timeout_cl.cancel()
        self._timeout_cl = reactor.callLater(timeout, self._cb_timedout)
        return d

    def _cb_timedout(self):
        # Timeout fired: stop reading and drop the connection, so the body
        # received so far is finalized and handed back instead of discarded.
        self._txresponse._transport.stopProducing()
        self._txresponse._transport._producer.loseConnection()

    def _cb_timeout(self, result, request, url, timeout):
        # Needed for HTTPS requests, otherwise _ResponseReader doesn't
        # receive connectionLost()
        if self._txresponse and self._txresponse._transport:
            self._txresponse._transport.stopProducing()
        return result


class TimeoutProcessingDownloadHandler(HTTPDownloadHandler):

    def download_request(self, request, spider):
        agent = TimeoutProcessingAgent(
            contextFactory=self._contextFactory,
            pool=self._pool,
            maxsize=getattr(spider, 'download_maxsize', self._default_maxsize),
            warnsize=getattr(spider, 'download_warnsize', self._default_warnsize),
            fail_on_dataloss=self._fail_on_dataloss,
        )
        return agent.download_request(request)
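

To wire this in, point Scrapy's DOWNLOAD_HANDLERS setting at the custom handler. A minimal sketch follows, assuming the code above is saved as myproject/handlers.py (a hypothetical module path). Since the handler cuts the connection on purpose, you will likely also want DOWNLOAD_FAIL_ON_DATALOSS = False so Scrapy delivers the truncated body to the spider (flagged with 'dataloss' in response.flags) instead of failing the request:

# settings.py -- 'myproject.handlers' is a hypothetical module path for
# wherever the handler above is saved.
DOWNLOAD_HANDLERS = {
    'http': 'myproject.handlers.TimeoutProcessingDownloadHandler',
    'https': 'myproject.handlers.TimeoutProcessingDownloadHandler',
}

# Hand partial responses to the spider rather than treating the deliberate
# connection drop as an error.
DOWNLOAD_FAIL_ON_DATALOSS = False

Per-request timeouts still come from the standard download_timeout meta key, e.g. Request(url, meta={'download_timeout': 5}), which download_request above reads before arming its replacement callback.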
@pengisgood

It works for me. Thanks.
