Skip to content

Commit b380fbd

Browse files
committed
refactor(adagents): address review follow-ups on #750
- Cite adcp#4827 in _resolve_publisher_property_selectors docstring. - isinstance(list) guard on property_tags / property_ids in both publisher_properties resolution and the pre-existing property_tags / property_ids authorization branches — string-iterating-as-tags is now caught and resolves to []. - Lift domain index build out of per-agent loop; _build_domain_index module helper used by both get_all_properties and get_properties_by_agent, index passed through _resolve_agent_properties → _resolve_publisher_ property_selectors. O(N+M) instead of O(agents × N). - Document "no federated fallback" on get_properties_by_agent and get_all_properties docstrings; point at PR #752's federated helpers. - Cafemedia scale test wall-clock budget 2.0s → 5.0s for shared-runner margin. Bot review: #750
1 parent 6124465 commit b380fbd

2 files changed

Lines changed: 227 additions & 24 deletions

File tree

src/adcp/adagents.py

Lines changed: 83 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -1207,12 +1207,16 @@ async def verify_agent_for_property(
12071207
def _resolve_agent_properties(
12081208
agent: dict[str, Any],
12091209
top_level_properties: list[dict[str, Any]],
1210+
domain_index: dict[str, list[dict[str, Any]]],
12101211
) -> list[dict[str, Any]]:
12111212
"""Resolve properties for a single agent entry based on its authorization_type.
12121213
12131214
Args:
12141215
agent: An authorized_agents entry
12151216
top_level_properties: The top-level properties array from adagents.json
1217+
domain_index: Pre-built ``publisher_domain → [property, ...]`` index
1218+
over ``top_level_properties`` (built once per file by the caller
1219+
via :func:`_build_domain_index`).
12161220
12171221
Returns:
12181222
List of resolved property dicts for this agent
@@ -1230,7 +1234,10 @@ def _resolve_agent_properties(
12301234

12311235
# Handle property_ids (filter top-level properties by property_id)
12321236
if authorization_type == "property_ids":
1233-
authorized_ids = set(agent.get("property_ids", []))
1237+
raw_ids = agent.get("property_ids")
1238+
if not isinstance(raw_ids, list):
1239+
return []
1240+
authorized_ids = {i for i in raw_ids if isinstance(i, str)}
12341241
return [
12351242
p
12361243
for p in top_level_properties
@@ -1239,7 +1246,10 @@ def _resolve_agent_properties(
12391246

12401247
# Handle property_tags (filter top-level properties by tags)
12411248
if authorization_type == "property_tags":
1242-
authorized_tags = {t for t in agent.get("property_tags", []) if isinstance(t, str)}
1249+
raw_tags = agent.get("property_tags")
1250+
if not isinstance(raw_tags, list):
1251+
return []
1252+
authorized_tags = {t for t in raw_tags if isinstance(t, str)}
12431253
return [
12441254
p
12451255
for p in top_level_properties
@@ -1260,20 +1270,45 @@ def _resolve_agent_properties(
12601270
selectors = _fanout_publisher_properties(
12611271
[p for p in publisher_props if isinstance(p, dict)]
12621272
)
1263-
return _resolve_publisher_property_selectors(selectors, top_level_properties)
1273+
return _resolve_publisher_property_selectors(selectors, domain_index)
12641274

12651275
return []
12661276

12671277

1278+
def _build_domain_index(
1279+
properties: list[dict[str, Any]],
1280+
) -> dict[str, list[dict[str, Any]]]:
1281+
"""Build a ``publisher_domain → [property, ...]`` index.
1282+
1283+
O(N) up-front cost; reused across every selector for that file so the
1284+
per-selector resolution cost drops from O(properties) to O(1) lookup
1285+
plus O(matches) filtering. Malformed entries (non-dict, missing or
1286+
non-string ``publisher_domain``) are skipped.
1287+
"""
1288+
domain_index: dict[str, list[dict[str, Any]]] = {}
1289+
for prop in properties:
1290+
if not isinstance(prop, dict):
1291+
continue
1292+
domain = prop.get("publisher_domain")
1293+
if not isinstance(domain, str) or not domain:
1294+
continue
1295+
domain_index.setdefault(domain, []).append(prop)
1296+
return domain_index
1297+
1298+
12681299
def _resolve_publisher_property_selectors(
12691300
selectors: list[dict[str, Any]],
1270-
top_level_properties: list[dict[str, Any]],
1301+
domain_index: dict[str, list[dict[str, Any]]],
12711302
) -> list[dict[str, Any]]:
12721303
"""Resolve fanned-out publisher_properties selectors against inline data.
12731304
1305+
Resolves selectors per adcp#4827 §Resolution-paths (inline path
1306+
against the parent file's top-level properties[] indexed by
1307+
publisher_domain).
1308+
12741309
For each selector (one per publisher_domain), look up the matching
1275-
properties in ``top_level_properties`` by ``publisher_domain`` and
1276-
apply the selector's ``selection_type``:
1310+
properties in ``domain_index`` by ``publisher_domain`` and apply the
1311+
selector's ``selection_type``:
12771312
12781313
- ``"all"``: every property under that domain
12791314
- ``"by_tag"``: properties whose ``tags`` intersect ``property_tags``
@@ -1289,25 +1324,16 @@ def _resolve_publisher_property_selectors(
12891324
federated fallback (fetching the publisher's own adagents.json) is
12901325
out of scope for this resolver and lives in companion helpers.
12911326
1327+
``domain_index`` is built once per file by :func:`_build_domain_index`
1328+
and reused across every agent's selectors in that file.
1329+
12921330
Results are deduplicated by ``(publisher_domain, property_id)``.
12931331
Raises :class:`AdagentsValidationError` if any matching property is
12941332
missing the required ``property_id`` field (fail-fast per CLAUDE.md).
12951333
"""
12961334
if not selectors:
12971335
return []
12981336

1299-
# Build domain → [property, ...] index once. O(N) up-front trades
1300-
# against O(selectors × properties) per-domain scans, which blows
1301-
# up at cafemedia scale (6,843 properties × thousands of selectors).
1302-
domain_index: dict[str, list[dict[str, Any]]] = {}
1303-
for prop in top_level_properties:
1304-
if not isinstance(prop, dict):
1305-
continue
1306-
domain = prop.get("publisher_domain")
1307-
if not isinstance(domain, str) or not domain:
1308-
continue
1309-
domain_index.setdefault(domain, []).append(prop)
1310-
13111337
resolved: list[dict[str, Any]] = []
13121338
seen: set[tuple[str, str]] = set()
13131339
for selector in selectors:
@@ -1323,7 +1349,10 @@ def _resolve_publisher_property_selectors(
13231349
if selection_type == "all":
13241350
matched = list(candidates)
13251351
elif selection_type == "by_tag":
1326-
wanted_tags = {t for t in selector.get("property_tags", []) or [] if isinstance(t, str)}
1352+
raw_tags = selector.get("property_tags")
1353+
if not isinstance(raw_tags, list):
1354+
continue
1355+
wanted_tags = {t for t in raw_tags if isinstance(t, str)}
13271356
if not wanted_tags:
13281357
continue
13291358
matched = [
@@ -1332,7 +1361,10 @@ def _resolve_publisher_property_selectors(
13321361
if {t for t in p.get("tags", []) or [] if isinstance(t, str)} & wanted_tags
13331362
]
13341363
elif selection_type == "by_id":
1335-
wanted_ids = {i for i in selector.get("property_ids", []) or [] if isinstance(i, str)}
1364+
raw_ids = selector.get("property_ids")
1365+
if not isinstance(raw_ids, list):
1366+
continue
1367+
wanted_ids = {i for i in raw_ids if isinstance(i, str)}
13361368
if not wanted_ids:
13371369
continue
13381370
matched = [p for p in candidates if p.get("property_id") in wanted_ids]
@@ -1433,6 +1465,17 @@ def get_all_properties(adagents_data: dict[str, Any]) -> list[dict[str, Any]]:
14331465
Handles all authorization types: inline_properties, property_ids,
14341466
property_tags, and publisher_properties.
14351467
1468+
For ``publisher_properties`` selectors whose target ``publisher_domain``
1469+
is NOT present inline in this file's top-level ``properties[]`` array,
1470+
this function returns no properties for that selector. Federated
1471+
fallback (fetching the child publisher's own adagents.json to resolve
1472+
the selector remotely) is out of scope here and lives in
1473+
:func:`fetch_agent_authorizations_from_directory` and
1474+
:func:`detect_publisher_properties_divergence` from companion PR #752.
1475+
Wire-only authorization checks that assume federated resolution will
1476+
under-authorize against managed-network parent files that only inline
1477+
a subset of their child domains.
1478+
14361479
Args:
14371480
adagents_data: Parsed adagents.json data
14381481
@@ -1464,6 +1507,11 @@ def get_all_properties(adagents_data: dict[str, Any]) -> list[dict[str, Any]]:
14641507
)
14651508
]
14661509

1510+
# Build the domain index once per file — _resolve_agent_properties is
1511+
# called per-agent, and at cafemedia scale (thousands of properties ×
1512+
# multiple agents) rebuilding it inside each call is O(agents × N).
1513+
domain_index = _build_domain_index(revoked_top_level)
1514+
14671515
properties = []
14681516
for agent in authorized_agents:
14691517
if not isinstance(agent, dict):
@@ -1475,7 +1523,7 @@ def get_all_properties(adagents_data: dict[str, Any]) -> list[dict[str, Any]]:
14751523

14761524
# revoked_top_level pre-filters revoked domains from the per-domain
14771525
# index, so inline resolution honors revocation transparently.
1478-
agent_properties = _resolve_agent_properties(agent, revoked_top_level)
1526+
agent_properties = _resolve_agent_properties(agent, revoked_top_level, domain_index)
14791527

14801528
for prop in agent_properties:
14811529
prop_with_agent = {**prop, "agent_url": agent_url}
@@ -1520,6 +1568,17 @@ def get_properties_by_agent(adagents_data: dict[str, Any], agent_url: str) -> li
15201568
selectors (resolved from the parent file's top-level properties[]
15211569
array per adcp#4827)
15221570
1571+
For ``publisher_properties`` selectors whose target ``publisher_domain``
1572+
is NOT present inline in this file's top-level ``properties[]`` array,
1573+
this function returns no properties for that selector. Federated
1574+
fallback (fetching the child publisher's own adagents.json to resolve
1575+
the selector remotely) is out of scope here and lives in
1576+
:func:`fetch_agent_authorizations_from_directory` and
1577+
:func:`detect_publisher_properties_divergence` from companion PR #752.
1578+
Wire-only authorization checks that assume federated resolution will
1579+
under-authorize against managed-network parent files that only inline
1580+
a subset of their child domains.
1581+
15231582
Args:
15241583
adagents_data: Parsed adagents.json data
15251584
agent_url: URL of the agent to filter by
@@ -1554,6 +1613,8 @@ def get_properties_by_agent(adagents_data: dict[str, Any], agent_url: str) -> li
15541613

15551614
normalized_agent_url = normalize_url(agent_url)
15561615

1616+
domain_index = _build_domain_index(revoked_top_level)
1617+
15571618
for agent in authorized_agents:
15581619
if not isinstance(agent, dict):
15591620
continue
@@ -1567,7 +1628,7 @@ def get_properties_by_agent(adagents_data: dict[str, Any], agent_url: str) -> li
15671628

15681629
# revoked_top_level pre-filters revoked domains from the per-domain
15691630
# index, so inline resolution honors revocation transparently.
1570-
resolved = _resolve_agent_properties(agent, revoked_top_level)
1631+
resolved = _resolve_agent_properties(agent, revoked_top_level, domain_index)
15711632
return resolved
15721633

15731634
return []

tests/test_adagents.py

Lines changed: 144 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1759,7 +1759,7 @@ def test_property_missing_property_id_raises(self):
17591759
get_properties_by_agent(adagents_data, "https://agent1.example.com")
17601760

17611761
def test_get_properties_by_agent_cafemedia_scale(self):
1762-
"""6,843 inline properties across 6,800 child domains under one publisher_domains[] selector.
1762+
"""6,843 inline properties across 6,800 child domains under one publisher_domains[].
17631763
17641764
Wall-clock-bounded to catch O(N×M) regressions in the resolver.
17651765
At this scale a naive per-domain linear scan over the property list
@@ -1815,10 +1815,152 @@ def test_get_properties_by_agent_cafemedia_scale(self):
18151815
result = get_properties_by_agent(adagents_data, "https://agent1.example.com")
18161816
elapsed = time.perf_counter() - start
18171817

1818-
assert elapsed < 2.0, f"resolution took {elapsed:.2f}s (>= 2.0s budget)"
1818+
assert elapsed < 5.0, f"resolution took {elapsed:.2f}s (>= 5.0s budget)"
18191819
assert len(result) == 6843
18201820
assert {p["publisher_domain"] for p in result} == set(child_domains)
18211821

1822+
def test_malformed_property_tags_value_resolves_empty(self):
1823+
"""publisher_properties selector with property_tags as a STRING resolves to [].
1824+
1825+
Without the isinstance(list) guard, ``property_tags: "ctv"`` iterates
1826+
char-by-char and matches properties tagged ``"c"``/``"t"``/``"v"``.
1827+
The resolver must fail-closed on malformed input.
1828+
"""
1829+
adagents_data = {
1830+
"properties": [
1831+
{
1832+
"property_id": "p1",
1833+
"publisher_domain": "site1.example",
1834+
"property_type": "website",
1835+
"name": "Site 1",
1836+
"identifiers": [{"type": "domain", "value": "site1.example"}],
1837+
"tags": ["c"],
1838+
},
1839+
{
1840+
"property_id": "p2",
1841+
"publisher_domain": "site1.example",
1842+
"property_type": "website",
1843+
"name": "Site 2",
1844+
"identifiers": [{"type": "domain", "value": "site2.example"}],
1845+
"tags": ["t"],
1846+
},
1847+
{
1848+
"property_id": "p3",
1849+
"publisher_domain": "site1.example",
1850+
"property_type": "website",
1851+
"name": "Site 3",
1852+
"identifiers": [{"type": "domain", "value": "site3.example"}],
1853+
"tags": ["v"],
1854+
},
1855+
],
1856+
"authorized_agents": [
1857+
{
1858+
"url": "https://agent1.example.com",
1859+
"authorization_type": "publisher_properties",
1860+
"authorized_for": "Test",
1861+
"publisher_properties": [
1862+
{
1863+
"publisher_domain": "site1.example",
1864+
"selection_type": "by_tag",
1865+
"property_tags": "ctv", # malformed: string, not list
1866+
},
1867+
],
1868+
},
1869+
],
1870+
}
1871+
1872+
result = get_properties_by_agent(adagents_data, "https://agent1.example.com")
1873+
assert result == []
1874+
1875+
def test_get_all_properties_builds_index_once(self):
1876+
"""_build_domain_index runs once per file, not once per agent.
1877+
1878+
At N agents × M properties, rebuilding the index inside
1879+
_resolve_agent_properties is O(agents × M). This test patches the
1880+
helper with a counter and asserts a single invocation across a
1881+
file with multiple publisher_properties agents.
1882+
"""
1883+
from unittest.mock import patch
1884+
1885+
from adcp import adagents as adagents_module
1886+
1887+
adagents_data = {
1888+
"properties": [
1889+
{
1890+
"property_id": "p1",
1891+
"publisher_domain": "site1.example",
1892+
"property_type": "website",
1893+
"name": "Site 1",
1894+
"identifiers": [{"type": "domain", "value": "site1.example"}],
1895+
"tags": ["managed"],
1896+
},
1897+
{
1898+
"property_id": "p2",
1899+
"publisher_domain": "site2.example",
1900+
"property_type": "website",
1901+
"name": "Site 2",
1902+
"identifiers": [{"type": "domain", "value": "site2.example"}],
1903+
"tags": ["managed"],
1904+
},
1905+
],
1906+
"authorized_agents": [
1907+
{
1908+
"url": "https://agent1.example.com",
1909+
"authorization_type": "publisher_properties",
1910+
"authorized_for": "A",
1911+
"publisher_properties": [
1912+
{
1913+
"publisher_domain": "site1.example",
1914+
"selection_type": "all",
1915+
},
1916+
],
1917+
},
1918+
{
1919+
"url": "https://agent2.example.com",
1920+
"authorization_type": "publisher_properties",
1921+
"authorized_for": "B",
1922+
"publisher_properties": [
1923+
{
1924+
"publisher_domain": "site2.example",
1925+
"selection_type": "all",
1926+
},
1927+
],
1928+
},
1929+
{
1930+
"url": "https://agent3.example.com",
1931+
"authorization_type": "publisher_properties",
1932+
"authorized_for": "C",
1933+
"publisher_properties": [
1934+
{
1935+
"publisher_domain": "site1.example",
1936+
"selection_type": "by_tag",
1937+
"property_tags": ["managed"],
1938+
},
1939+
],
1940+
},
1941+
],
1942+
}
1943+
1944+
original = adagents_module._build_domain_index
1945+
with patch.object(
1946+
adagents_module,
1947+
"_build_domain_index",
1948+
side_effect=original,
1949+
) as spy:
1950+
result = get_all_properties(adagents_data)
1951+
1952+
assert spy.call_count == 1, (
1953+
f"_build_domain_index called {spy.call_count} times; "
1954+
"expected exactly once per get_all_properties invocation"
1955+
)
1956+
# Sanity: index actually reused — all three agents resolved.
1957+
agent_urls = {p["agent_url"] for p in result}
1958+
assert agent_urls == {
1959+
"https://agent1.example.com",
1960+
"https://agent2.example.com",
1961+
"https://agent3.example.com",
1962+
}
1963+
18221964
def test_get_properties_by_agent_protocol_agnostic(self):
18231965
"""Should match agent URL regardless of protocol."""
18241966
adagents_data = {

0 commit comments

Comments
 (0)