From f9610a6e6ce7b81487c8074e00d2710143346f0d Mon Sep 17 00:00:00 2001 From: vringar Date: Fri, 3 Apr 2026 00:12:31 +0000 Subject: [PATCH 1/5] refactor(browser): extract dns error predicate for testability Extract the inline is_dns_error predicate from execute_command_sequence into a module-level _is_dns_error() function so the test imports and exercises the real code instead of a local copy. Remove unused CommandExecutionError import from the test file. --- openwpm/browser_manager.py | 19 ++++++++- test/test_webdriver_utils.py | 75 ++++++++++++++++++++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 1eb0b4b78f..59af33cbd1 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -43,6 +43,20 @@ from .task_manager import TaskManager +def _is_dns_error(command_status: str, error_text: Optional[str]) -> bool: + """Return True when the failure is a DNS resolution error (NXDOMAIN). + + DNS resolution errors are expected when crawling large domain lists + (e.g. Tranco top-100k) and don't indicate a browser or instrumentation + failure. # Only NXDOMAIN; DNS timeouts/SERVFAIL intentionally still count + """ + return ( + command_status == "neterror" + and error_text is not None + and error_text == "dnsNotFound" + ) + + class BrowserManagerHandle: """The BrowserManagerHandle class is responsible for holding all the configuration and status information on BrowserManager process @@ -462,8 +476,9 @@ def execute_command_sequence( return if command_status != "ok": - with task_manager.threadlock: - task_manager.failure_count += 1 + if not _is_dns_error(command_status, error_text): + with task_manager.threadlock: + task_manager.failure_count += 1 if task_manager.failure_count > task_manager.failure_limit: self.logger.critical( "BROWSER %i: Command execution failure pushes failure " diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index a9f518f8fb..87cffafa7d 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -1,3 +1,6 @@ +import pytest + +from openwpm.browser_manager import _is_dns_error from openwpm.commands.utils.webdriver_utils import parse_neterror from openwpm.utilities import db_utils @@ -26,3 +29,75 @@ def test_parse_neterror_integration(default_params, task_manager_creator): assert get_command[0] == "neterror" assert get_command[1] == "dnsNotFound" + + +@pytest.mark.slow +def test_dns_error_does_not_count_against_failure_limit( + default_params, task_manager_creator +): + """AC-4: 100+ DNS errors must not trigger CommandExecutionError even with + a low failure_limit. Each navigation to a non-existent domain produces a + dnsNotFound neterror that should be excluded from the failure counter.""" + manager_params, browser_params = default_params + manager_params.num_browsers = 1 + manager_params.failure_limit = 5 + manager, db = task_manager_creator((manager_params, browser_params[:1])) + + num_domains = 110 + for i in range(num_domains): + manager.get(f"http://domain-{i}.invalid") + + manager.close() + + rows = db_utils.query_db( + db, + "SELECT command_status, error FROM crawl_history WHERE command='GetCommand'", + as_tuple=True, + ) + dns_rows = [(s, e) for s, e in rows if s == "neterror" and e == "dnsNotFound"] + assert len(dns_rows) == num_domains + + +def test_is_dns_error_predicate(): + """AC-5: Verify that _is_dns_error is True only for dnsNotFound neterrors. + + Non-DNS neterror types (connectionRefused, netOffline, etc.) and other + command statuses must NOT be treated as DNS errors, so they continue to + increment failure_count as before. + """ + # Only this exact combination qualifies + assert _is_dns_error("neterror", "dnsNotFound") is True + + # Other neterror types must NOT be exempt + assert _is_dns_error("neterror", "connectionRefused") is False + assert _is_dns_error("neterror", "netOffline") is False + assert _is_dns_error("neterror", "proxyConnectFailure") is False + + # Missing error_text must NOT be exempt + assert _is_dns_error("neterror", None) is False + + # Non-neterror statuses must NOT be exempt + assert _is_dns_error("ok", "dnsNotFound") is False + assert _is_dns_error("critical", "dnsNotFound") is False + assert _is_dns_error("error", "dnsNotFound") is False + assert _is_dns_error("timeout", "dnsNotFound") is False + + # parse_neterror returns the right code for a DNS failure message + dns_msg = ( + "selenium.common.exceptions.WebDriverException: " + "Message: Reached error page: " + "about:neterror?e=dnsNotFound&u=http%3A//missing.example/&c=UTF-8&" + "f=regular&d=We+can%27t+connect" + ) + assert parse_neterror(dns_msg) == "dnsNotFound" + assert _is_dns_error("neterror", parse_neterror(dns_msg)) is True + + # A different neterror code does NOT trigger the exemption + conn_msg = ( + "selenium.common.exceptions.WebDriverException: " + "Message: Reached error page: " + "about:neterror?e=connectionRefused&u=http%3A//localhost%3A1/&c=UTF-8&" + "f=regular&d=refused." + ) + assert parse_neterror(conn_msg) == "connectionRefused" + assert _is_dns_error("neterror", parse_neterror(conn_msg)) is False From d3ed507d02e2f13e9a4a7a14ae3ebcea2f26c6d8 Mon Sep 17 00:00:00 2001 From: vringar Date: Tue, 14 Apr 2026 20:34:39 +0000 Subject: [PATCH 2/5] refactor(browser): clean up dns error predicate naming and docstring --- openwpm/browser_manager.py | 7 ++++--- test/test_webdriver_utils.py | 28 ++++++++++++++-------------- 2 files changed, 18 insertions(+), 17 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 59af33cbd1..31eacf7076 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -43,12 +43,13 @@ from .task_manager import TaskManager -def _is_dns_error(command_status: str, error_text: Optional[str]) -> bool: +def is_dns_error(command_status: str, error_text: Optional[str]) -> bool: """Return True when the failure is a DNS resolution error (NXDOMAIN). DNS resolution errors are expected when crawling large domain lists (e.g. Tranco top-100k) and don't indicate a browser or instrumentation - failure. # Only NXDOMAIN; DNS timeouts/SERVFAIL intentionally still count + failure. Only NXDOMAIN is excluded; DNS timeouts and SERVFAIL + intentionally still count. """ return ( command_status == "neterror" @@ -476,7 +477,7 @@ def execute_command_sequence( return if command_status != "ok": - if not _is_dns_error(command_status, error_text): + if not is_dns_error(command_status, error_text): with task_manager.threadlock: task_manager.failure_count += 1 if task_manager.failure_count > task_manager.failure_limit: diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index 87cffafa7d..191b199464 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -1,6 +1,6 @@ import pytest -from openwpm.browser_manager import _is_dns_error +from openwpm.browser_manager import is_dns_error from openwpm.commands.utils.webdriver_utils import parse_neterror from openwpm.utilities import db_utils @@ -58,29 +58,29 @@ def test_dns_error_does_not_count_against_failure_limit( assert len(dns_rows) == num_domains -def test_is_dns_error_predicate(): - """AC-5: Verify that _is_dns_error is True only for dnsNotFound neterrors. +def testis_dns_error_predicate(): + """AC-5: Verify that is_dns_error is True only for dnsNotFound neterrors. Non-DNS neterror types (connectionRefused, netOffline, etc.) and other command statuses must NOT be treated as DNS errors, so they continue to increment failure_count as before. """ # Only this exact combination qualifies - assert _is_dns_error("neterror", "dnsNotFound") is True + assert is_dns_error("neterror", "dnsNotFound") is True # Other neterror types must NOT be exempt - assert _is_dns_error("neterror", "connectionRefused") is False - assert _is_dns_error("neterror", "netOffline") is False - assert _is_dns_error("neterror", "proxyConnectFailure") is False + assert is_dns_error("neterror", "connectionRefused") is False + assert is_dns_error("neterror", "netOffline") is False + assert is_dns_error("neterror", "proxyConnectFailure") is False # Missing error_text must NOT be exempt - assert _is_dns_error("neterror", None) is False + assert is_dns_error("neterror", None) is False # Non-neterror statuses must NOT be exempt - assert _is_dns_error("ok", "dnsNotFound") is False - assert _is_dns_error("critical", "dnsNotFound") is False - assert _is_dns_error("error", "dnsNotFound") is False - assert _is_dns_error("timeout", "dnsNotFound") is False + assert is_dns_error("ok", "dnsNotFound") is False + assert is_dns_error("critical", "dnsNotFound") is False + assert is_dns_error("error", "dnsNotFound") is False + assert is_dns_error("timeout", "dnsNotFound") is False # parse_neterror returns the right code for a DNS failure message dns_msg = ( @@ -90,7 +90,7 @@ def test_is_dns_error_predicate(): "f=regular&d=We+can%27t+connect" ) assert parse_neterror(dns_msg) == "dnsNotFound" - assert _is_dns_error("neterror", parse_neterror(dns_msg)) is True + assert is_dns_error("neterror", parse_neterror(dns_msg)) is True # A different neterror code does NOT trigger the exemption conn_msg = ( @@ -100,4 +100,4 @@ def test_is_dns_error_predicate(): "f=regular&d=refused." ) assert parse_neterror(conn_msg) == "connectionRefused" - assert _is_dns_error("neterror", parse_neterror(conn_msg)) is False + assert is_dns_error("neterror", parse_neterror(conn_msg)) is False From 5c93f9916a8f4fd0a28250fc58aba9b61e0e4b62 Mon Sep 17 00:00:00 2001 From: vringar Date: Wed, 15 Apr 2026 09:01:14 +0000 Subject: [PATCH 3/5] fix(test): restore underscore in test_is_dns_error_predicate name --- openwpm/browser_manager.py | 2 +- test/test_webdriver_utils.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index 31eacf7076..cac3f16e0e 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -48,7 +48,7 @@ def is_dns_error(command_status: str, error_text: Optional[str]) -> bool: DNS resolution errors are expected when crawling large domain lists (e.g. Tranco top-100k) and don't indicate a browser or instrumentation - failure. Only NXDOMAIN is excluded; DNS timeouts and SERVFAIL + failure. Only NXDOMAIN is excluded; DNS timeouts and SERVFAIL intentionally still count. """ return ( diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index 191b199464..8a0220dc2e 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -58,7 +58,7 @@ def test_dns_error_does_not_count_against_failure_limit( assert len(dns_rows) == num_domains -def testis_dns_error_predicate(): +def test_is_dns_error_predicate(): """AC-5: Verify that is_dns_error is True only for dnsNotFound neterrors. Non-DNS neterror types (connectionRefused, netOffline, etc.) and other From e0bb6775d77d792f5814bbf970242bddb80e1308 Mon Sep 17 00:00:00 2001 From: vringar Date: Sun, 19 Apr 2026 21:28:36 +0000 Subject: [PATCH 4/5] docs(test): remove internal AC references from docstrings --- test/test_webdriver_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index 8a0220dc2e..9d04fd3d63 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -35,8 +35,8 @@ def test_parse_neterror_integration(default_params, task_manager_creator): def test_dns_error_does_not_count_against_failure_limit( default_params, task_manager_creator ): - """AC-4: 100+ DNS errors must not trigger CommandExecutionError even with - a low failure_limit. Each navigation to a non-existent domain produces a + """100+ DNS errors must not trigger CommandExecutionError even with a low + failure_limit. Each navigation to a non-existent domain produces a dnsNotFound neterror that should be excluded from the failure counter.""" manager_params, browser_params = default_params manager_params.num_browsers = 1 @@ -59,7 +59,7 @@ def test_dns_error_does_not_count_against_failure_limit( def test_is_dns_error_predicate(): - """AC-5: Verify that is_dns_error is True only for dnsNotFound neterrors. + """Verify that is_dns_error is True only for dnsNotFound neterrors. Non-DNS neterror types (connectionRefused, netOffline, etc.) and other command statuses must NOT be treated as DNS errors, so they continue to From 9228048c08c6035192e0b0a1db7f539199f5a95a Mon Sep 17 00:00:00 2001 From: vringar Date: Thu, 7 May 2026 11:30:20 +0000 Subject: [PATCH 5/5] fix(browser): lock failure count check and reduce DNS test cost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Move failure_count threshold check inside threadlock so increment and compare are atomic across browser threads. - Reduce test domain count from 110 to failure_limit*3 and wrap the navigation loop in try/finally for cleanup on failure. Browser restart on DNS errors is intentional — appropriately cautious for our use case. --- openwpm/browser_manager.py | 25 ++++++++++++++----------- test/test_webdriver_utils.py | 19 +++++++++++-------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/openwpm/browser_manager.py b/openwpm/browser_manager.py index cac3f16e0e..ffea008e71 100644 --- a/openwpm/browser_manager.py +++ b/openwpm/browser_manager.py @@ -480,17 +480,20 @@ def execute_command_sequence( if not is_dns_error(command_status, error_text): with task_manager.threadlock: task_manager.failure_count += 1 - if task_manager.failure_count > task_manager.failure_limit: - self.logger.critical( - "BROWSER %i: Command execution failure pushes failure " - "count above the allowable limit. Setting " - "failure_status." % self.browser_id - ) - task_manager.failure_status = { - "ErrorType": "ExceedCommandFailureLimit", - "CommandSequence": command_sequence, - } - return + exceeded_limit = ( + task_manager.failure_count > task_manager.failure_limit + ) + if exceeded_limit: + self.logger.critical( + "BROWSER %i: Command execution failure pushes failure " + "count above the allowable limit. Setting " + "failure_status." % self.browser_id + ) + task_manager.failure_status = { + "ErrorType": "ExceedCommandFailureLimit", + "CommandSequence": command_sequence, + } + return self.restart_required = True self.logger.debug( "BROWSER %i: Browser restart required" % self.browser_id diff --git a/test/test_webdriver_utils.py b/test/test_webdriver_utils.py index 9d04fd3d63..d9b22a3770 100644 --- a/test/test_webdriver_utils.py +++ b/test/test_webdriver_utils.py @@ -35,19 +35,22 @@ def test_parse_neterror_integration(default_params, task_manager_creator): def test_dns_error_does_not_count_against_failure_limit( default_params, task_manager_creator ): - """100+ DNS errors must not trigger CommandExecutionError even with a low - failure_limit. Each navigation to a non-existent domain produces a - dnsNotFound neterror that should be excluded from the failure counter.""" + """DNS errors past failure_limit must not trigger CommandExecutionError. + + Each navigation to a non-existent domain produces a dnsNotFound neterror + that should be excluded from the failure counter. + """ manager_params, browser_params = default_params manager_params.num_browsers = 1 manager_params.failure_limit = 5 manager, db = task_manager_creator((manager_params, browser_params[:1])) - num_domains = 110 - for i in range(num_domains): - manager.get(f"http://domain-{i}.invalid") - - manager.close() + num_domains = manager_params.failure_limit * 3 + try: + for i in range(num_domains): + manager.get(f"http://domain-{i}.invalid") + finally: + manager.close() rows = db_utils.query_db( db,