Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ jobs:
cache: pip
- name: Install dependencies
run: |
sudo apt update -y && sudo apt install -y gcc-9
python -m pip install --upgrade pip
pip install tox codecov
- name: Run tests
Expand Down
2 changes: 1 addition & 1 deletion docs/headers.rst
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ Translation is supported for the following headers:
Zyte API Zyte Smart Proxy Manager
========================= ===========================
``Zyte-Device`` ``X-Crawlera-Profile``
``Zyte-Error`` ``X-Crawlera-Error``
``Zyte-Error-Type`` ``X-Crawlera-Error``
``Zyte-Geolocation`` ``X-Crawlera-Region``
``Zyte-JobId`` ``X-Crawlera-JobId``
``Zyte-Override-Headers`` ``X-Crawlera-Profile-Pass``
Expand Down
12 changes: 6 additions & 6 deletions scrapy_zyte_smartproxy/middleware.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def _is_banned(self, response):
return (
response.status == self.ban_code
and response.headers.get("X-Crawlera-Error") == b"banned"
) or (response.status in {520, 521} and response.headers.get("Zyte-Error"))
) or (response.status in {520, 521} and response.headers.get("Zyte-Error-Type"))

def _is_auth_error(self, response):
return (
Expand All @@ -316,21 +316,21 @@ def _is_auth_error(self, response):
)

def _throttle_error(self, response):
error = response.headers.get("Zyte-Error") or response.headers.get(
error = response.headers.get("Zyte-Error-Type") or response.headers.get(
"X-Crawlera-Error"
)
if response.status in {429, 503} and error and error != b"banned":
return error.decode("utf-8")
return None

def _process_error(self, response):
if "Zyte-Error" in response.headers:
value = response.headers.get("Zyte-Error")
if "Zyte-Error-Type" in response.headers:
value = response.headers.get("Zyte-Error-Type")
response.headers["X-Crawlera-Error"] = value
return value
if "X-Crawlera-Error" in response.headers:
value = response.headers.get("X-Crawlera-Error")
response.headers["Zyte-Error"] = value
response.headers["Zyte-Error-Type"] = value
return value
return None

Expand Down Expand Up @@ -480,10 +480,10 @@ def _get_url_domain(self, url):
return parsed.netloc

def _is_zyte_smartproxy_or_zapi_response(self, response):
"""Check if is Smart Proxy Manager or Zyte API proxy mode response"""
return (
"X-Crawlera-Version" in response.headers
or "Zyte-Request-Id" in response.headers
or "zyte-error-type" in response.headers
)

def _get_slot_key(self, request):
Expand Down
34 changes: 23 additions & 11 deletions tests/test_all.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
("X-Crawlera-Version", ""),
("X-Crawlera-Version", "1.36.3-cd5e44"),
("Zyte-Request-Id", "123456789"),
("zyte-error-type", "foo"),
)


Expand Down Expand Up @@ -160,7 +159,7 @@ def _assert_enabled(
)
assert mw.process_response(req, res, spider) is res
assert res.headers["X-Crawlera-Error"] == b"banned"
assert res.headers["Zyte-Error"] == b"banned"
assert res.headers["Zyte-Error-Type"] == b"banned"

# max bans reached and close_spider called
self.assertEqual(crawler.engine.fake_spider_closed_result, (spider, "banned"))
Expand Down Expand Up @@ -482,7 +481,7 @@ def _test_stats(self, settings, prefix):
)

res = self._mock_zyte_smartproxy_response(
req.url, status=mw.ban_code, headers={"Zyte-Error": "somethingbad"}
req.url, status=mw.ban_code, headers={"Zyte-Error-Type": "somethingbad"}
)
assert mw.process_response(req, res, spider) is res
self.assertEqual(crawler.stats.get_value("{}/response".format(prefix)), 2)
Expand All @@ -497,7 +496,7 @@ def _test_stats(self, settings, prefix):
crawler.stats.get_value("{}/response/error/somethingbad".format(prefix)), 1
)
self.assertEqual(res.headers["X-Crawlera-Error"], b"somethingbad")
self.assertEqual(res.headers["Zyte-Error"], b"somethingbad")
self.assertEqual(res.headers["Zyte-Error-Type"], b"somethingbad")

res = self._mock_zyte_smartproxy_response(
req.url,
Expand All @@ -516,7 +515,7 @@ def _test_stats(self, settings, prefix):
crawler.stats.get_value("{}/response/banned".format(prefix)), 1
)
self.assertEqual(res.headers["X-Crawlera-Error"], b"banned")
self.assertEqual(res.headers["Zyte-Error"], b"banned")
self.assertEqual(res.headers["Zyte-Error-Type"], b"banned")

res = self._mock_zyte_smartproxy_response(
req.url,
Expand Down Expand Up @@ -672,7 +671,10 @@ def test_is_banned(self):
res = Response(
req.url,
status=503,
headers={"Zyte-Error": "/limits/over-global-limit"},
headers={
"Zyte-Request-Id": "123456789",
"Zyte-Error-Type": "/limits/over-global-limit",
},
)
res = mw.process_response(req, res, self.spider)
self.assertFalse(mw._is_banned(res))
Expand All @@ -681,16 +683,26 @@ def test_is_banned(self):
res = mw.process_response(req, res, self.spider)
self.assertTrue(mw._is_banned(res))
res = Response(
req.url, status=520, headers={"Zyte-Error": "/download/temporary-error"}
req.url,
status=520,
headers={
"Zyte-Request-Id": "123456789",
"Zyte-Error-Type": "/download/temporary-error",
},
)
res = mw.process_response(req, res, self.spider)
assert mw.crawler.stats.get_value("zyte_smartproxy/response/banned") == 1
self.assertTrue(mw._is_banned(res))
res = Response(
req.url,
status=521,
headers={"Zyte-Error": "/download/internal-error"},
headers={
"Zyte-Request-Id": "123456789",
"Zyte-Error-Type": "/download/internal-error",
},
)
res = mw.process_response(req, res, self.spider)
assert mw.crawler.stats.get_value("zyte_smartproxy/response/banned") == 2
self.assertTrue(mw._is_banned(res))

@patch("random.uniform")
Expand Down Expand Up @@ -733,23 +745,23 @@ def test_noslaves_delays(self, random_uniform_patch):
over_use_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=429,
headers={"Zyte-Error": "/limits/over-user-limit"},
headers={"Zyte-Error-Type": "/limits/over-user-limit"},
)
mw.process_response(noslaves_req, over_use_limit_response, self.spider)
self.assertEqual(slot.delay, backoff_step * 2**1)

over_domain_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=429,
headers={"Zyte-Error": "/limits/over-domain-limit"},
headers={"Zyte-Error-Type": "/limits/over-domain-limit"},
)
mw.process_response(noslaves_req, over_domain_limit_response, self.spider)
self.assertEqual(slot.delay, backoff_step * 2**2)

over_global_limit_response = self._mock_zyte_smartproxy_response(
ban_url,
status=503,
headers={"Zyte-Error": "/limits/over-global-limit"},
headers={"Zyte-Error-Type": "/limits/over-global-limit"},
)
mw.process_response(noslaves_req, over_global_limit_response, self.spider)
self.assertEqual(slot.delay, max_delay)
Expand Down
4 changes: 2 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -56,8 +56,8 @@ commands =
[testenv:twinecheck]
basepython = python3
deps =
twine==5.1.1
build==1.2.2
twine==6.1.0
build==1.2.2.post1
commands =
python -m build --sdist
twine check dist/*
Loading