Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
45 changes: 32 additions & 13 deletions openwpm/browser_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,21 @@
from .task_manager import TaskManager


def is_dns_error(command_status: str, error_text: Optional[str]) -> bool:
"""Return True when the failure is a DNS resolution error (NXDOMAIN).

DNS resolution errors are expected when crawling large domain lists
(e.g. Tranco top-100k) and don't indicate a browser or instrumentation
failure. Only NXDOMAIN is excluded; DNS timeouts and SERVFAIL
intentionally still count.
"""
return (
command_status == "neterror"
and error_text is not None
and error_text == "dnsNotFound"
)


class BrowserManagerHandle:
"""The BrowserManagerHandle class is responsible for holding all the
configuration and status information on BrowserManager process
Expand Down Expand Up @@ -462,19 +477,23 @@ def execute_command_sequence(
return

if command_status != "ok":
with task_manager.threadlock:
task_manager.failure_count += 1
if task_manager.failure_count > task_manager.failure_limit:
self.logger.critical(
"BROWSER %i: Command execution failure pushes failure "
"count above the allowable limit. Setting "
"failure_status." % self.browser_id
)
task_manager.failure_status = {
"ErrorType": "ExceedCommandFailureLimit",
"CommandSequence": command_sequence,
}
return
if not is_dns_error(command_status, error_text):
with task_manager.threadlock:
task_manager.failure_count += 1
exceeded_limit = (
task_manager.failure_count > task_manager.failure_limit
)
if exceeded_limit:
self.logger.critical(
"BROWSER %i: Command execution failure pushes failure "
"count above the allowable limit. Setting "
"failure_status." % self.browser_id
)
task_manager.failure_status = {
"ErrorType": "ExceedCommandFailureLimit",
"CommandSequence": command_sequence,
}
return
self.restart_required = True
self.logger.debug(
"BROWSER %i: Browser restart required" % self.browser_id
Expand Down
78 changes: 78 additions & 0 deletions test/test_webdriver_utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
import pytest

from openwpm.browser_manager import is_dns_error
from openwpm.commands.utils.webdriver_utils import parse_neterror
from openwpm.utilities import db_utils

Expand Down Expand Up @@ -26,3 +29,78 @@ def test_parse_neterror_integration(default_params, task_manager_creator):

assert get_command[0] == "neterror"
assert get_command[1] == "dnsNotFound"


@pytest.mark.slow
def test_dns_error_does_not_count_against_failure_limit(
default_params, task_manager_creator
):
"""DNS errors past failure_limit must not trigger CommandExecutionError.

Each navigation to a non-existent domain produces a dnsNotFound neterror
that should be excluded from the failure counter.
"""
manager_params, browser_params = default_params
manager_params.num_browsers = 1
manager_params.failure_limit = 5
manager, db = task_manager_creator((manager_params, browser_params[:1]))

num_domains = manager_params.failure_limit * 3
try:
for i in range(num_domains):
manager.get(f"http://domain-{i}.invalid")
finally:
manager.close()

rows = db_utils.query_db(
db,
"SELECT command_status, error FROM crawl_history WHERE command='GetCommand'",
as_tuple=True,
)
dns_rows = [(s, e) for s, e in rows if s == "neterror" and e == "dnsNotFound"]
assert len(dns_rows) == num_domains


def test_is_dns_error_predicate():
"""Verify that is_dns_error is True only for dnsNotFound neterrors.

Non-DNS neterror types (connectionRefused, netOffline, etc.) and other
command statuses must NOT be treated as DNS errors, so they continue to
increment failure_count as before.
"""
# Only this exact combination qualifies
assert is_dns_error("neterror", "dnsNotFound") is True

# Other neterror types must NOT be exempt
assert is_dns_error("neterror", "connectionRefused") is False
assert is_dns_error("neterror", "netOffline") is False
assert is_dns_error("neterror", "proxyConnectFailure") is False

# Missing error_text must NOT be exempt
assert is_dns_error("neterror", None) is False

# Non-neterror statuses must NOT be exempt
assert is_dns_error("ok", "dnsNotFound") is False
assert is_dns_error("critical", "dnsNotFound") is False
assert is_dns_error("error", "dnsNotFound") is False
assert is_dns_error("timeout", "dnsNotFound") is False

# parse_neterror returns the right code for a DNS failure message
dns_msg = (
"selenium.common.exceptions.WebDriverException: "
"Message: Reached error page: "
"about:neterror?e=dnsNotFound&u=http%3A//missing.example/&c=UTF-8&"
"f=regular&d=We+can%27t+connect"
)
assert parse_neterror(dns_msg) == "dnsNotFound"
assert is_dns_error("neterror", parse_neterror(dns_msg)) is True

# A different neterror code does NOT trigger the exemption
conn_msg = (
"selenium.common.exceptions.WebDriverException: "
"Message: Reached error page: "
"about:neterror?e=connectionRefused&u=http%3A//localhost%3A1/&c=UTF-8&"
"f=regular&d=refused."
)
assert parse_neterror(conn_msg) == "connectionRefused"
assert is_dns_error("neterror", parse_neterror(conn_msg)) is False
Loading