diff --git a/libs/openant-core/core/parser_adapter.py b/libs/openant-core/core/parser_adapter.py index ef2f845..cf221ad 100644 --- a/libs/openant-core/core/parser_adapter.py +++ b/libs/openant-core/core/parser_adapter.py @@ -80,6 +80,7 @@ def parse_repository( name: str = None, diff_manifest: str | None = None, fresh: bool = False, + library_mode: bool = False, ) -> ParseResult: """Parse a repository into an OpenAnt dataset. @@ -96,6 +97,8 @@ def parse_repository( fresh: If True, delete existing dataset.json before parsing so all units are regenerated from scratch. Only dataset.json is deleted; other artifacts in output_dir (e.g. analyzer outputs) are preserved. + library_mode: If True, seed the public API surface as reachability + entry points (opt-in, union-only). Returns: ParseResult with paths to generated files and stats. @@ -127,19 +130,19 @@ def parse_repository( # Dispatch to the right parser if language == "python": - result = _parse_python(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_python(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "javascript": - result = _parse_javascript(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_javascript(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "go": - result = _parse_go(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_go(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "c": - result = _parse_c(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_c(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "ruby": - result = _parse_ruby(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_ruby(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "php": - result = _parse_php(repo_path, 
output_dir, processing_level, skip_tests, name) + result = _parse_php(repo_path, output_dir, processing_level, skip_tests, name, library_mode) elif language == "zig": - result = _parse_zig(repo_path, output_dir, processing_level, skip_tests, name) + result = _parse_zig(repo_path, output_dir, processing_level, skip_tests, name, library_mode) else: raise ValueError(f"Unsupported language: {language}") @@ -207,11 +210,18 @@ def _maybe_apply_diff_filter( # Reachability filter (shared by Python path; JS/Go handle it internally) # --------------------------------------------------------------------------- +# library_seed_ids is now shared in utilities/agentic_enhancer/entry_point_detector.py +# so every parser pipeline (not just Python) can seed the public API. It is loaded +# below via the same importlib path as EntryPointDetector to dodge the heavy +# utilities/__init__ imports. + + def apply_reachability_filter( dataset: dict, output_dir: str, processing_level: str, extra_entry_points: "set[str] | None" = None, + library_mode: bool = False, ) -> dict: """Filter dataset units to only those reachable from entry points. @@ -254,6 +264,8 @@ def _load_module(name, filename): _epd = _load_module("entry_point_detector", "entry_point_detector.py") _ra = _load_module("reachability_analyzer", "reachability_analyzer.py") EntryPointDetector = _epd.EntryPointDetector + blackout_warning = _epd.blackout_warning + library_seed_ids = _epd.library_seed_ids ReachabilityAnalyzer = _ra.ReachabilityAnalyzer call_graph_path = os.path.join(output_dir, "call_graph.json") @@ -277,6 +289,11 @@ def _load_module(name, filename): entry_points = detector.detect_entry_points() if extra_entry_points: entry_points = entry_points | extra_entry_points + # Library-mode (opt-in): the public API is the entry surface. Union-only — + # never demotes a structurally-detected app entry point, so an app scan with + # the flag on can only gain reachable units, never lose one. 
+ if library_mode: + entry_points = entry_points | library_seed_ids(functions) units = dataset.get("units", []) original_count = len(units) @@ -349,6 +366,12 @@ def _load_module(name, filename): file=sys.stderr, ) + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), library_mode=library_mode) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + # Warn about unimplemented higher-level filters if processing_level == "codeql": print( @@ -374,7 +397,7 @@ def _load_module(name, filename): # Python parser # --------------------------------------------------------------------------- -def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the Python parser. The Python parser has a clean `parse_repository()` function that we can @@ -402,7 +425,8 @@ def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_t # Apply reachability filter if processing_level requires it if processing_level != "all": - dataset = _apply_reachability_filter(dataset, output_dir, processing_level) + dataset = _apply_reachability_filter(dataset, output_dir, processing_level, + library_mode=library_mode) # Write outputs write_json(dataset_path, dataset) @@ -523,7 +547,7 @@ def _file_lock(lock_path: Path): f.close() -def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the JavaScript/TypeScript parser. 
The JS parser is a PipelineTest class that runs Node.js subprocesses. @@ -547,6 +571,8 @@ def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, sk cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, @@ -582,7 +608,7 @@ def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, sk # Go parser # --------------------------------------------------------------------------- -def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the Go parser. The Go parser is a PipelineTest class that calls a compiled Go binary. @@ -603,6 +629,8 @@ def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, @@ -638,7 +666,7 @@ def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests # C/C++ parser # --------------------------------------------------------------------------- -def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the C/C++ parser. 
The C parser uses tree-sitter for function extraction and call graph @@ -661,6 +689,8 @@ def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, @@ -697,7 +727,7 @@ def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: # Ruby parser # --------------------------------------------------------------------------- -def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the Ruby parser. The Ruby parser uses tree-sitter for function extraction and call graph @@ -720,6 +750,8 @@ def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tes cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, @@ -756,7 +788,7 @@ def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tes # PHP parser # --------------------------------------------------------------------------- -def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the PHP parser. 
The PHP parser uses tree-sitter for function extraction and call graph @@ -779,6 +811,8 @@ def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_test cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, @@ -815,7 +849,7 @@ def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_test # Zig parser # --------------------------------------------------------------------------- -def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult: +def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult: """Invoke the Zig parser. The Zig parser uses tree-sitter for function extraction and call graph @@ -838,6 +872,8 @@ def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_test cmd.extend(["--name", name]) if skip_tests: cmd.append("--skip-tests") + if library_mode: + cmd.append("--library-mode") result = subprocess.run( cmd, diff --git a/libs/openant-core/core/scanner.py b/libs/openant-core/core/scanner.py index b17a98c..4f6d476 100644 --- a/libs/openant-core/core/scanner.py +++ b/libs/openant-core/core/scanner.py @@ -62,6 +62,7 @@ def scan_repository( diff_manifest: str | None = None, llm_reachability: bool = False, llm_reachability_max_code_bytes: int = 1500, + library_mode: bool = False, ) -> ScanResult: """Scan a repository for vulnerabilities. 
@@ -171,6 +172,7 @@ def _step_label(name: str) -> str: processing_level=effective_parse_level, skip_tests=skip_tests, diff_manifest=diff_manifest, + library_mode=library_mode, ) ctx.summary = { diff --git a/libs/openant-core/openant/cli.py b/libs/openant-core/openant/cli.py index b082d68..1f25d7e 100644 --- a/libs/openant-core/openant/cli.py +++ b/libs/openant-core/openant/cli.py @@ -75,6 +75,7 @@ def cmd_scan(args): repo_url=getattr(args, "repo_url", None), commit_sha=getattr(args, "commit_sha", None), diff_manifest=getattr(args, "diff_manifest", None), + library_mode=getattr(args, "library_mode", False), llm_reachability=getattr(args, "llm_reachability", False), llm_reachability_max_code_bytes=getattr( args, "llm_reachability_max_code_bytes", 1500 @@ -129,6 +130,7 @@ def cmd_parse(args): name=getattr(args, "name", None), diff_manifest=getattr(args, "diff_manifest", None), fresh=getattr(args, "fresh", False), + library_mode=getattr(args, "library_mode", False), ) ctx.summary = { @@ -995,6 +997,8 @@ def main(): scan_p.add_argument("--dynamic-test", action="store_true", help="Enable Docker-isolated dynamic testing (off by default)") scan_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)") + scan_p.add_argument("--library-mode", action="store_true", + help="Seed the exported public API as entry points (for libraries with no main/route/CLI entry point)") scan_p.add_argument("--limit", type=int, help="Max units to analyze") scan_p.add_argument( "--llm-config", @@ -1058,6 +1062,8 @@ def main(): help="Processing level (default: reachable)", ) parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)") + parse_p.add_argument("--library-mode", action="store_true", + help="Seed the exported public API as entry points (for libraries with no main/route/CLI entry point)") parse_p.add_argument("--name", help="Dataset name (default: derived 
from repo path)") parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected") parse_p.add_argument("--fresh", action="store_true", diff --git a/libs/openant-core/parsers/c/test_pipeline.py b/libs/openant-core/parsers/c/test_pipeline.py index c19824d..883b8bf 100644 --- a/libs/openant-core/parsers/c/test_pipeline.py +++ b/libs/openant-core/parsers/c/test_pipeline.py @@ -48,7 +48,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from utilities.file_io import open_utf8, read_json, run_utf8, write_json from utilities.context_enhancer import ContextEnhancer -from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer +from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids # Local imports from repository_scanner import RepositoryScanner @@ -87,7 +87,8 @@ def __init__( processing_level: ProcessingLevel = ProcessingLevel.ALL, skip_tests: bool = False, depth: int = 3, - name: str = None + name: str = None, + library_mode: bool = False ): self.repo_path = os.path.abspath(repo_path) self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output') @@ -98,6 +99,7 @@ def __init__( self.skip_tests = skip_tests self.depth = depth self.dataset_name = name + self.library_mode = library_mode # Pipeline artifacts self.scan_results_file = None @@ -292,6 +294,13 @@ def apply_reachability_filter(self) -> bool: detector = EntryPointDetector(normalized_functions, call_graph) self.entry_points = detector.detect_entry_points() + # Library-mode: a library's entry surface is its exported public API, + # which carries no main/route/CLI marker. Seed it so the BFS reaches + # the core instead of blacking out. Union-only — never drops a + # structurally-detected entry point. 
+ if self.library_mode: + self.entry_points = self.entry_points | library_seed_ids(normalized_functions) + # Build reachability reachability = ReachabilityAnalyzer( functions=normalized_functions, @@ -323,6 +332,13 @@ def apply_reachability_filter(self) -> bool: "reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0 } + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), + library_mode=getattr(self, "library_mode", False)) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + write_json(self.dataset_file, dataset) elapsed = (datetime.now() - start_time).total_seconds() @@ -1027,6 +1043,11 @@ def main(): default=None, help='Dataset name (default: derived from repo path)' ) + parser.add_argument( + '--library-mode', + action='store_true', + help='Seed the exported public API as entry points (for libraries with no main/route/CLI)' + ) args = parser.parse_args() @@ -1048,7 +1069,8 @@ def main(): processing_level=processing_level, skip_tests=args.skip_tests, depth=args.depth, - name=args.name + name=args.name, + library_mode=args.library_mode ) results = pipeline.run_full_pipeline() diff --git a/libs/openant-core/parsers/go/test_pipeline.py b/libs/openant-core/parsers/go/test_pipeline.py index bba7551..a9aa1bd 100644 --- a/libs/openant-core/parsers/go/test_pipeline.py +++ b/libs/openant-core/parsers/go/test_pipeline.py @@ -79,7 +79,7 @@ def _stdout_supports_unicode() -> bool: # Add parent directory to path for utilities import sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from utilities.context_enhancer import ContextEnhancer -from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer +from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids class 
ProcessingLevel(Enum): @@ -110,7 +110,8 @@ def __init__( processing_level: ProcessingLevel = ProcessingLevel.ALL, skip_tests: bool = False, depth: int = 3, - name: str = None + name: str = None, + library_mode: bool = False ): self.repo_path = os.path.abspath(repo_path) self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output') @@ -121,6 +122,7 @@ def __init__( self.skip_tests = skip_tests self.depth = depth self.dataset_name = name + self.library_mode = library_mode # Go parser binary location self.go_parser = os.path.join(self.parser_dir, 'go_parser', 'go_parser') @@ -412,6 +414,11 @@ def apply_reachability_filter(self) -> bool: detector = EntryPointDetector(normalized_functions, call_graph) self.entry_points = detector.detect_entry_points() + # Library-mode: seed the exported public API (a library has no + # main/route/CLI marker). Union-only — never drops a real entry point. + if self.library_mode: + self.entry_points = self.entry_points | library_seed_ids(normalized_functions) + # Build reachability analyzer reachability = ReachabilityAnalyzer( functions=normalized_functions, @@ -445,6 +452,13 @@ def apply_reachability_filter(self) -> bool: "reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0 } + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), + library_mode=getattr(self, "library_mode", False)) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + # Write filtered dataset write_json(self.dataset_file, dataset) @@ -1183,6 +1197,11 @@ def main(): default=None, help='Dataset name (default: derived from repo path)' ) + parser.add_argument( + '--library-mode', + action='store_true', + help='Seed the exported public API as entry points (for libraries with no main/route/CLI)' + ) args = parser.parse_args() @@ -1205,7 +1224,8 @@ def main(): 
processing_level=processing_level, skip_tests=args.skip_tests, depth=args.depth, - name=args.name + name=args.name, + library_mode=args.library_mode ) results = pipeline.run_full_pipeline() diff --git a/libs/openant-core/parsers/javascript/test_pipeline.py b/libs/openant-core/parsers/javascript/test_pipeline.py index 6cf8911..6a32d59 100644 --- a/libs/openant-core/parsers/javascript/test_pipeline.py +++ b/libs/openant-core/parsers/javascript/test_pipeline.py @@ -78,7 +78,7 @@ def _stdout_supports_unicode() -> bool: # Add parent directory to path for utilities import sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from utilities.context_enhancer import ContextEnhancer -from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer +from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids class ProcessingLevel(Enum): @@ -103,7 +103,8 @@ def __init__( processing_level: ProcessingLevel = ProcessingLevel.ALL, skip_tests: bool = False, depth: int = 3, - name: str = None + name: str = None, + library_mode: bool = False ): self.repo_path = os.path.abspath(repo_path) self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output') @@ -114,6 +115,7 @@ def __init__( self.skip_tests = skip_tests self.depth = depth self.dataset_name = name + self.library_mode = library_mode # Component locations # repository_scanner.js and unit_generator.js are in this package @@ -623,6 +625,11 @@ def apply_reachability_filter(self) -> bool: detector = EntryPointDetector(functions, call_graph) self.entry_points = detector.detect_entry_points() + # Library-mode: seed the exported public API (a library has no + # main/route/CLI marker). Union-only — never drops a real entry point. 
+ if self.library_mode: + self.entry_points = self.entry_points | library_seed_ids(functions) + # Build reachability analyzer reachability = ReachabilityAnalyzer( functions=functions, @@ -659,6 +666,13 @@ def apply_reachability_filter(self) -> bool: "reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0 } + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), + library_mode=getattr(self, "library_mode", False)) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + # Write filtered dataset write_json(self.dataset_file, dataset) @@ -1318,6 +1332,7 @@ def main(): skip_tests = False depth = 3 name = None + library_mode = False else: parser = argparse.ArgumentParser( description='Test the parser pipeline on a repository', @@ -1389,6 +1404,11 @@ def main(): default=None, help='Dataset name (default: derived from repo path)' ) + parser.add_argument( + '--library-mode', + action='store_true', + help='Seed the exported public API as entry points (for libraries with no main/route/CLI)' + ) args = parser.parse_args() repo_path = args.repo_path @@ -1400,6 +1420,7 @@ def main(): skip_tests = args.skip_tests depth = args.depth name = args.name + library_mode = args.library_mode if not os.path.exists(repo_path): print(f"Error: Repository not found: {repo_path}") @@ -1423,7 +1444,8 @@ def main(): processing_level=processing_level, skip_tests=skip_tests, depth=depth, - name=name + name=name, + library_mode=library_mode ) results = pipeline.run_full_pipeline() diff --git a/libs/openant-core/parsers/php/test_pipeline.py b/libs/openant-core/parsers/php/test_pipeline.py index 9947bdd..e3f9960 100644 --- a/libs/openant-core/parsers/php/test_pipeline.py +++ b/libs/openant-core/parsers/php/test_pipeline.py @@ -48,7 +48,7 @@ sys.path.insert(0, 
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from utilities.file_io import open_utf8, read_json, run_utf8, write_json from utilities.context_enhancer import ContextEnhancer -from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer +from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids # Local imports from repository_scanner import RepositoryScanner @@ -78,7 +78,8 @@ def __init__( processing_level: ProcessingLevel = ProcessingLevel.ALL, skip_tests: bool = False, depth: int = 3, - name: str = None + name: str = None, + library_mode: bool = False ): self.repo_path = os.path.abspath(repo_path) self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output') @@ -89,6 +90,7 @@ def __init__( self.skip_tests = skip_tests self.depth = depth self.dataset_name = name + self.library_mode = library_mode # Pipeline artifacts self.scan_results_file = None @@ -283,6 +285,11 @@ def apply_reachability_filter(self) -> bool: detector = EntryPointDetector(normalized_functions, call_graph) self.entry_points = detector.detect_entry_points() + # Library-mode: seed the exported public API (a library has no + # main/route/CLI marker). Union-only — never drops a real entry point. 
+ if self.library_mode: + self.entry_points = self.entry_points | library_seed_ids(normalized_functions) + # Build reachability reachability = ReachabilityAnalyzer( functions=normalized_functions, @@ -314,6 +321,13 @@ def apply_reachability_filter(self) -> bool: "reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0 } + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), + library_mode=getattr(self, "library_mode", False)) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + write_json(self.dataset_file, dataset) elapsed = (datetime.now() - start_time).total_seconds() @@ -1003,6 +1017,11 @@ def main(): default=None, help='Dataset name (default: derived from repo path)' ) + parser.add_argument( + '--library-mode', + action='store_true', + help='Seed the exported public API as entry points (for libraries with no main/route/CLI)' + ) args = parser.parse_args() @@ -1024,7 +1043,8 @@ def main(): processing_level=processing_level, skip_tests=args.skip_tests, depth=args.depth, - name=args.name + name=args.name, + library_mode=args.library_mode ) results = pipeline.run_full_pipeline() diff --git a/libs/openant-core/parsers/ruby/test_pipeline.py b/libs/openant-core/parsers/ruby/test_pipeline.py index cb61d15..228e3b2 100644 --- a/libs/openant-core/parsers/ruby/test_pipeline.py +++ b/libs/openant-core/parsers/ruby/test_pipeline.py @@ -48,7 +48,7 @@ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) from utilities.file_io import open_utf8, read_json, run_utf8, write_json from utilities.context_enhancer import ContextEnhancer -from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer +from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids # Local imports from 
repository_scanner import RepositoryScanner @@ -78,7 +78,8 @@ def __init__( processing_level: ProcessingLevel = ProcessingLevel.ALL, skip_tests: bool = False, depth: int = 3, - name: str = None + name: str = None, + library_mode: bool = False ): self.repo_path = os.path.abspath(repo_path) self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output') @@ -89,6 +90,7 @@ def __init__( self.skip_tests = skip_tests self.depth = depth self.dataset_name = name + self.library_mode = library_mode # Pipeline artifacts self.scan_results_file = None @@ -283,6 +285,11 @@ def apply_reachability_filter(self) -> bool: detector = EntryPointDetector(normalized_functions, call_graph) self.entry_points = detector.detect_entry_points() + # Library-mode: seed the exported public API (a library has no + # main/route/CLI marker). Union-only — never drops a real entry point. + if self.library_mode: + self.entry_points = self.entry_points | library_seed_ids(normalized_functions) + # Build reachability reachability = ReachabilityAnalyzer( functions=normalized_functions, @@ -314,6 +321,13 @@ def apply_reachability_filter(self) -> bool: "reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0 } + _blackout = blackout_warning(detector.entry_point_details, original_count, + len(filtered_units), + library_mode=getattr(self, "library_mode", False)) + if _blackout: + dataset["metadata"]["reachability_filter"]["warning"] = _blackout + print(f" [Warning] {_blackout}", file=sys.stderr) + write_json(self.dataset_file, dataset) elapsed = (datetime.now() - start_time).total_seconds() @@ -1003,6 +1017,11 @@ def main(): default=None, help='Dataset name (default: derived from repo path)' ) + parser.add_argument( + '--library-mode', + action='store_true', + help='Seed the exported public API as entry points (for libraries with no main/route/CLI)' + ) args = parser.parse_args() @@ -1024,7 +1043,8 @@ def main(): 
processing_level=processing_level, skip_tests=args.skip_tests, depth=args.depth, - name=args.name + name=args.name, + library_mode=args.library_mode ) results = pipeline.run_full_pipeline() diff --git a/libs/openant-core/parsers/zig/test_pipeline.py b/libs/openant-core/parsers/zig/test_pipeline.py index bbde219..c8a1193 100644 --- a/libs/openant-core/parsers/zig/test_pipeline.py +++ b/libs/openant-core/parsers/zig/test_pipeline.py @@ -49,6 +49,11 @@ def main(): "--skip-tests", action="store_true", help="Skip test files and functions" ) parser.add_argument("--name", help="Dataset name (defaults to repo directory name)") + parser.add_argument( + "--library-mode", + action="store_true", + help="Seed the exported public API as entry points (for libraries with no main/route/CLI)", + ) parser.add_argument( "--dependency-depth", type=int, @@ -129,7 +134,8 @@ def main(): # Apply processing level filters if args.processing_level != "all": call_graph_output = apply_processing_filter( - call_graph_output, args.processing_level, str(repo_path) + call_graph_output, args.processing_level, str(repo_path), + library_mode=args.library_mode, ) print( f" After {args.processing_level} filter: {len(call_graph_output['functions'])} functions", @@ -161,7 +167,7 @@ def main(): def apply_processing_filter( - call_graph_output: dict, level: str, repo_path: str + call_graph_output: dict, level: str, repo_path: str, library_mode: bool = False ) -> dict: """ Apply processing level filters to reduce the function set. 
@@ -173,21 +179,22 @@ def apply_processing_filter( - exploitable: Filter to reachable + CodeQL + LLM-classified exploitable """ if level == "reachable": - return apply_reachability_filter(call_graph_output, repo_path) + return apply_reachability_filter(call_graph_output, repo_path, library_mode=library_mode) elif level == "codeql": # First apply reachability, then would filter by CodeQL results - filtered = apply_reachability_filter(call_graph_output, repo_path) + filtered = apply_reachability_filter(call_graph_output, repo_path, library_mode=library_mode) # CodeQL filtering would be applied here if results exist return filtered elif level == "exploitable": # Apply all filters - filtered = apply_reachability_filter(call_graph_output, repo_path) + filtered = apply_reachability_filter(call_graph_output, repo_path, library_mode=library_mode) # CodeQL + LLM filtering would be applied here return filtered return call_graph_output -def apply_reachability_filter(call_graph_output: dict, repo_path: str) -> dict: +def apply_reachability_filter(call_graph_output: dict, repo_path: str, + library_mode: bool = False) -> dict: """Filter to functions reachable from entry points. Uses the real EntryPointDetector / ReachabilityAnalyzer contract, matching @@ -201,7 +208,7 @@ def apply_reachability_filter(call_graph_output: dict, repo_path: str) -> dict: --processing-level reachable. """ try: - from utilities.agentic_enhancer.entry_point_detector import EntryPointDetector + from utilities.agentic_enhancer.entry_point_detector import EntryPointDetector, blackout_warning, library_seed_ids from utilities.agentic_enhancer.reachability_analyzer import ReachabilityAnalyzer except ImportError: print( @@ -219,6 +226,11 @@ def apply_reachability_filter(call_graph_output: dict, repo_path: str) -> dict: detector = EntryPointDetector(functions, call_graph) entry_points = detector.detect_entry_points() + # Library-mode: seed the exported public API (a library has no main/route/CLI + # marker). 
Union-only — never drops a structurally-detected entry point. + if library_mode: + entry_points = entry_points | library_seed_ids(functions) + # Compute the reachable set via reverse-BFS from the entry points. analyzer = ReachabilityAnalyzer( functions=functions, @@ -247,6 +259,11 @@ def apply_reachability_filter(call_graph_output: dict, repo_path: str) -> dict: if k in reachable } + _blackout = blackout_warning(detector.entry_point_details, len(functions), + len(filtered_functions), library_mode=library_mode) + if _blackout: + print(f" [Warning] {_blackout}", file=sys.stderr) + return result diff --git a/libs/openant-core/tests/test_blackout_warning.py b/libs/openant-core/tests/test_blackout_warning.py new file mode 100644 index 0000000..9f2d7b3 --- /dev/null +++ b/libs/openant-core/tests/test_blackout_warning.py @@ -0,0 +1,61 @@ +"""Fix B — reachability blackout warning (advisory; never changes filtering). + +The #75 zero-seed net only fires at EXACTLY 0 entry points. A library like +tree-sitter trips a handful of INCIDENTAL seeds (code that merely contains an +input-reading pattern), yielding a 96.6% reduction that looks like a successful +filter while the real public-API core was dropped. `blackout_warning` catches +both the total blackout and this partial-blackout-with-only-incidental-seeds case, +and stays silent for a normal app (real route/main/CLI seeds, moderate reduction). 
+""" +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # libs/openant-core + +from utilities.agentic_enhancer import blackout_warning # noqa: E402 + + +def _details(*reason_lists): + """Build an entry_point_details-shaped dict from per-seed reason lists.""" + return {f"f{i}": {"reasons": rs} for i, rs in enumerate(reason_lists)} + + +def test_total_blackout_warns(): + assert blackout_warning(_details(), original_count=500, reachable_count=0) is not None + + +def test_partial_blackout_incidental_seeds_warns(): + # tree-sitter shape: 4 incidental input_pattern seeds, 712 -> 24 (96.6% pruned). + details = _details(["input_pattern:fopen"], ["input_pattern:read"], + ["input_pattern:getenv"], ["input_pattern:scanf"]) + assert blackout_warning(details, original_count=712, reachable_count=24) is not None + + +def test_structural_seed_suppresses_even_high_reduction(): + # A real CLI/main seed means the high reduction is legitimate, not a blackout. + details = _details(["unit_type:main"], ["input_pattern:read"]) + assert blackout_warning(details, original_count=712, reachable_count=24) is None + + +def test_normal_app_reduction_is_silent(): + # Arkime C shape: route/main seeds, 1655 -> 608 (63% pruned). No warning. + details = _details(["unit_type:cli_handler"], ["unit_type:main"], ["unit_type:http_handler"]) + assert blackout_warning(details, original_count=1655, reachable_count=608) is None + + +def test_decorator_and_name_seeds_are_structural(): + assert blackout_warning(_details(["decorator:@app.route"]), + original_count=712, reachable_count=24) is None + assert blackout_warning(_details(["name:main"]), + original_count=712, reachable_count=24) is None + + +def test_library_mode_suppresses_warning(): + # With library-mode on, a high reduction is the intended precise result. 
+ details = _details(["input_pattern:read"]) + assert blackout_warning(details, original_count=712, reachable_count=24, + library_mode=True) is None + + +def test_empty_dataset_no_warning(): + assert blackout_warning(_details(), original_count=0, reachable_count=0) is None diff --git a/libs/openant-core/tests/test_library_seed_ids.py b/libs/openant-core/tests/test_library_seed_ids.py new file mode 100644 index 0000000..e77071f --- /dev/null +++ b/libs/openant-core/tests/test_library_seed_ids.py @@ -0,0 +1,63 @@ +"""Fix C — shared library_seed_ids: public-API seed set, both key casings. + +The subprocess pipelines normalize function records to camelCase (`isExported`) +while the on-disk call_graph and the Python path use snake_case (`is_exported`). +`library_seed_ids` must seed the exported, non-name-private functions under EITHER +casing, defaulting to exported when neither field is present (over-seed, never +under-seed). This is what lets a C/JS/etc. library's public API surface seed the +reachability BFS instead of blacking out. +""" +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) # libs/openant-core + +from utilities.agentic_enhancer import library_seed_ids # noqa: E402 + + +def test_snake_case_exported_seeded(): + fns = {"f.c:pub": {"name": "pub", "is_exported": True}} + assert library_seed_ids(fns) == {"f.c:pub"} + + +def test_snake_case_static_not_seeded(): + fns = {"f.c:priv": {"name": "priv", "is_exported": False}} + assert library_seed_ids(fns) == set() + + +def test_camel_case_exported_seeded(): + # JS/normalized-pipeline shape. + fns = {"f.js:pub": {"name": "pub", "isExported": True}} + assert library_seed_ids(fns) == {"f.js:pub"} + + +def test_camel_case_unexported_not_seeded(): + fns = {"f.js:priv": {"name": "priv", "isExported": False}} + assert library_seed_ids(fns) == set() + + +def test_missing_field_defaults_exported(): + # Parsers without an export field (python/ruby/php) default to exported. 
+ fns = {"m.py:helper": {"name": "helper"}} + assert library_seed_ids(fns) == {"m.py:helper"} + + +def test_leading_underscore_name_is_private(): + fns = { + "m.py:_internal": {"name": "_internal"}, + "m.py:public": {"name": "public"}, + } + assert library_seed_ids(fns) == {"m.py:public"} + + +def test_name_falls_back_to_func_id_tail(): + # No 'name' field -> derive from the func_id tail, strip a dotted qualifier. + fns = {"f.c:Mod.run": {"is_exported": True}} + assert library_seed_ids(fns) == {"f.c:Mod.run"} + + +def test_exported_but_underscore_still_excluded(): + # Name-private wins even when exported (a public-but-_-prefixed symbol is + # conventionally internal); over-seeding bias does not override the name rule. + fns = {"f.c:_x": {"name": "_x", "is_exported": True}} + assert library_seed_ids(fns) == set() diff --git a/libs/openant-core/utilities/agentic_enhancer/__init__.py b/libs/openant-core/utilities/agentic_enhancer/__init__.py index 3934926..a13756e 100644 --- a/libs/openant-core/utilities/agentic_enhancer/__init__.py +++ b/libs/openant-core/utilities/agentic_enhancer/__init__.py @@ -20,7 +20,7 @@ ) from .repository_index import RepositoryIndex, load_index_from_file from .tools import TOOL_DEFINITIONS, ToolExecutor -from .entry_point_detector import EntryPointDetector +from .entry_point_detector import EntryPointDetector, blackout_warning, library_seed_ids from .reachability_analyzer import ReachabilityAnalyzer __all__ = [ @@ -33,5 +33,7 @@ "TOOL_DEFINITIONS", "ToolExecutor", "EntryPointDetector", + "blackout_warning", + "library_seed_ids", "ReachabilityAnalyzer" ] diff --git a/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py b/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py index 5b278c5..737407a 100644 --- a/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py +++ b/libs/openant-core/utilities/agentic_enhancer/entry_point_detector.py @@ -276,3 +276,72 @@ def get_statistics(self) -> Dict: 
def _export_flag(record):
    """Resolve a function record's export flag across both key casings.

    ``is_exported`` (snake_case) is consulted first, then ``isExported``
    (camelCase). A key that is absent *or* explicitly ``None`` is treated as
    "unknown" and skipped; if neither key yields a value the function is
    assumed exported, so an unknown flag can never cause under-seeding.
    """
    for key in ("is_exported", "isExported"):
        flag = record.get(key)
        if flag is not None:
            return bool(flag)
    return True


def library_seed_ids(functions):
    """Return the public-API seed set for library-mode reachability.

    A pure library exposes no main/route/CLI entry point, so the structural
    detector finds nothing and the whole library is filtered out (0 reachable).
    In library-mode the *public surface* IS the entry surface: seed every
    exported/public function and let the forward BFS pull in its callees.

    Public = exported AND not name-private. Honours ``is_exported``/``isExported``
    when the parser provides it (C/Go/JS exclude static/unexported); for parsers
    without the field (python/ruby/php) it defaults True and the leading-underscore
    name heuristic decides. Both key casings are accepted because the subprocess
    pipelines normalize to camelCase while the on-disk call_graph is snake_case.
    The bias is intentionally toward over-seeding (more reachable = more analysed),
    never under-seeding.

    Args:
        functions: Mapping of function id -> function record (dict). ``None``
            or an empty mapping yields an empty seed set; a ``None`` record is
            treated as an empty record.

    Returns:
        Set of function ids to use as additional reachability entry points.
    """
    seeds = set()
    for func_id, record in (functions or {}).items():
        record = record or {}
        # Prefer the parser-supplied name; otherwise take the tail of the
        # "<file>:<qualified name>" id. Either way only the last dotted
        # component is tested, so "Mod.run" is judged as "run".
        qualified = record.get("name") or func_id.rsplit(":", 1)[-1]
        short_name = qualified.split(".")[-1]
        # Name-private wins even over an explicit exported flag. Note that any
        # leading underscore counts, so dunder names are excluded as well.
        if _export_flag(record) and not short_name.startswith("_"):
            seeds.add(func_id)
    return seeds


# Reason categories that indicate a STRUCTURAL entry point — a real route, program
# main, CLI command, framework handler, or decorator-marked endpoint — as opposed
# to an INCIDENTAL match (code merely contains an input-reading pattern). A result
# seeded ONLY by incidental matches is the library-blackout signature: the public
# API was never a seed, so the BFS dropped the core.
_STRUCTURAL_REASON_CATEGORIES = {"unit_type", "decorator", "name"}


def _has_structural_reason(detail):
    """Return True if an entry-point detail carries any structural reason.

    A reason is a ``"<category>:<value>"`` string; only the category before the
    first colon is compared. Malformed input (a non-dict detail, a missing or
    ``None`` ``reasons`` list, non-string reasons) counts as "not structural"
    instead of raising, because the caller is advisory-only.
    """
    reasons = detail.get("reasons") if isinstance(detail, dict) else None
    return any(
        isinstance(reason, str)
        and reason.partition(":")[0] in _STRUCTURAL_REASON_CATEGORIES
        for reason in (reasons or ())
    )


def blackout_warning(entry_point_details, original_count, reachable_count,
                     library_mode=False, reduction_threshold=0.90):
    """Return an advisory string when a reachability result looks like a silent
    library blackout, else None. This is ADVISORY ONLY — it never changes which
    units are kept, and it tolerates malformed detail entries rather than raise.

    Two triggers (both off when ``library_mode`` is set, since then the public
    API was deliberately seeded and a high reduction is the intended result):
      * total blackout — 0 of N units kept (no seedable frontier); or
      * partial blackout — >= ``reduction_threshold`` pruned AND no STRUCTURAL
        entry point was found (every seed is an incidental ``input_pattern``
        match). This is the case that slips past the zero-seed net: a handful of
        incidental seeds yield a 96%+ reduction that looks like success while the
        real public API surface was never analysed (e.g. a C/JS parser library).

    Args:
        entry_point_details: Mapping of entry-point id -> detail dict with a
            ``"reasons"`` list of ``"<category>:<value>"`` strings; may be None.
        original_count: Number of units before the reachability filter.
        reachable_count: Number of units kept by the filter.
        library_mode: True when the public API was seeded; disables the warning.
        reduction_threshold: Pruned fraction at or above which the partial
            blackout check applies.

    Returns:
        The warning text, or None when nothing looks wrong.
    """
    # Nothing to judge on an empty dataset, and nothing to warn about when the
    # public API was seeded on purpose.
    if original_count <= 0 or library_mode:
        return None

    if reachable_count == 0:
        return (f"Reachability kept 0 of {original_count} units — total blackout "
                f"(no entry point could seed the frontier). If this is a library, "
                f"re-run with --library-mode to seed the exported public API surface.")

    reduction = 1.0 - (reachable_count / original_count)
    if reduction < reduction_threshold:
        return None

    # A single structural seed makes a high reduction legitimate.
    if any(_has_structural_reason(detail)
           for detail in (entry_point_details or {}).values()):
        return None

    return (f"Reachability kept {reachable_count} of {original_count} units "
            f"({reduction * 100:.0f}% pruned) but found NO structural entry point "
            f"(route/main/CLI/handler) — only incidental code-pattern seeds. This is "
            f"the library-blackout pattern: the public API was not seeded, so the core "
            f"was dropped. Re-run with --library-mode to seed the exported public API.")