diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 1eb0b4b78..ffea008e7 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -43,6 +43,21 @@ from .task_manager import TaskManager +def is_dns_error(command_status: str, error_text: Optional[str]) -> bool: + """Return True when the failure is a DNS resolution error (NXDOMAIN). + + DNS resolution errors are expected when crawling large domain lists + (e.g. Tranco top-100k) and don't indicate a browser or instrumentation + failure. Only NXDOMAIN is excluded; DNS timeouts and SERVFAIL + intentionally still count. + """ + return ( + command_status == "neterror" + and error_text is not None + and error_text == "dnsNotFound" + ) + + class BrowserManagerHandle: """The BrowserManagerHandle class is responsible for holding all the configuration and status information on BrowserManager process @@ -462,19 +477,23 @@ def execute_command_sequence( return if command_status != "ok": - with task_manager.threadlock: - task_manager.failure_count += 1 - if task_manager.failure_count > task_manager.failure_limit: - self.logger.critical( - "BROWSER %i: Command execution failure pushes failure " - "count above the allowable limit. Setting " - "failure_status." % self.browser_id - ) - task_manager.failure_status = { - "ErrorType": "ExceedCommandFailureLimit", - "CommandSequence": command_sequence, - } - return + if not is_dns_error(command_status, error_text): + with task_manager.threadlock: + task_manager.failure_count += 1 + exceeded_limit = ( + task_manager.failure_count > task_manager.failure_limit + ) + if exceeded_limit: + self.logger.critical( + "BROWSER %i: Command execution failure pushes failure " + "count above the allowable limit. Setting " + "failure_status." % self.browser_id + ) + task_manager.failure_status = { + "ErrorType": "ExceedCommandFailureLimit", + "CommandSequence": command_sequence, + } + return self.restart_required = True self.logger.debug( "BROWSER %i: Browser restart required" % self.browser_id diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index a9f518f8f..d9b22a377 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -1,3 +1,6 @@ +import pytest + +from openwpm.browser_manager import is_dns_error from openwpm.commands.utils.webdriver_utils import parse_neterror from openwpm.utilities import db_utils @@ -26,3 +29,78 @@ def test_parse_neterror_integration(default_params, task_manager_creator): assert get_command[0] == "neterror" assert get_command[1] == "dnsNotFound" + + +@pytest.mark.slow +def test_dns_error_does_not_count_against_failure_limit( + default_params, task_manager_creator +): + """DNS errors past failure_limit must not trigger CommandExecutionError. + + Each navigation to a non-existent domain produces a dnsNotFound neterror + that should be excluded from the failure counter. + """ + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 5 + manager, db = task_manager_creator((manager_params, browser_params[:1])) + + num_domains = manager_params.failure_limit * 3 + try: + for i in range(num_domains): + manager.get(f"http://domain-{i}.invalid") + finally: + manager.close() + + rows = db_utils.query_db( + db, + "SELECT command_status, error FROM crawl_history WHERE command='GetCommand'", + as_tuple=True, + ) + dns_rows = [(s, e) for s, e in rows if s == "neterror" and e == "dnsNotFound"] + assert len(dns_rows) == num_domains + + +def test_is_dns_error_predicate(): + """Verify that is_dns_error is True only for dnsNotFound neterrors. + + Non-DNS neterror types (connectionRefused, netOffline, etc.) and other + command statuses must NOT be treated as DNS errors, so they continue to + increment failure_count as before. + """ + # Only this exact combination qualifies + assert is_dns_error("neterror", "dnsNotFound") is True + + # Other neterror types must NOT be exempt + assert is_dns_error("neterror", "connectionRefused") is False + assert is_dns_error("neterror", "netOffline") is False + assert is_dns_error("neterror", "proxyConnectFailure") is False + + # Missing error_text must NOT be exempt + assert is_dns_error("neterror", None) is False + + # Non-neterror statuses must NOT be exempt + assert is_dns_error("ok", "dnsNotFound") is False + assert is_dns_error("critical", "dnsNotFound") is False + assert is_dns_error("error", "dnsNotFound") is False + assert is_dns_error("timeout", "dnsNotFound") is False + + # parse_neterror returns the right code for a DNS failure message + dns_msg = ( + "selenium.common.exceptions.WebDriverException: " + "Message: Reached error page: " + "about:neterror?e=dnsNotFound&u=http%3A//missing.example/&c=UTF-8&" + "f=regular&d=We+can%27t+connect" + ) + assert parse_neterror(dns_msg) == "dnsNotFound" + assert is_dns_error("neterror", parse_neterror(dns_msg)) is True + + # A different neterror code does NOT trigger the exemption + conn_msg = ( + "selenium.common.exceptions.WebDriverException: " + "Message: Reached error page: " + "about:neterror?e=connectionRefused&u=http%3A//localhost%3A1/&c=UTF-8&" + "f=regular&d=refused." + ) + assert parse_neterror(conn_msg) == "connectionRefused" + assert is_dns_error("neterror", parse_neterror(conn_msg)) is False