Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 51 additions & 15 deletions libs/openant-core/core/parser_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ def parse_repository(
name: str = None,
diff_manifest: str | None = None,
fresh: bool = False,
library_mode: bool = False,
) -> ParseResult:
"""Parse a repository into an OpenAnt dataset.

Expand All @@ -96,6 +97,8 @@ def parse_repository(
fresh: If True, delete existing dataset.json before parsing so all
units are regenerated from scratch. Only dataset.json is deleted;
other artifacts in output_dir (e.g. analyzer outputs) are preserved.
library_mode: If True, also seed the public API surface as reachability
entry points. Opt-in; the seeds are unioned with the detected entry
points and never replace them.

Returns:
ParseResult with paths to generated files and stats.
Expand Down Expand Up @@ -127,19 +130,19 @@ def parse_repository(

# Dispatch to the right parser
if language == "python":
result = _parse_python(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_python(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "javascript":
result = _parse_javascript(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_javascript(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "go":
result = _parse_go(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_go(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "c":
result = _parse_c(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_c(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "ruby":
result = _parse_ruby(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_ruby(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "php":
result = _parse_php(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_php(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
elif language == "zig":
result = _parse_zig(repo_path, output_dir, processing_level, skip_tests, name)
result = _parse_zig(repo_path, output_dir, processing_level, skip_tests, name, library_mode)
else:
raise ValueError(f"Unsupported language: {language}")

Expand Down Expand Up @@ -207,11 +210,18 @@ def _maybe_apply_diff_filter(
# Reachability filter (shared by Python path; JS/Go handle it internally)
# ---------------------------------------------------------------------------

# library_seed_ids is defined in utilities/agentic_enhancer/entry_point_detector.py
# so that every parser pipeline (not just Python) can seed the public API. It is
# loaded below through the same importlib path as EntryPointDetector, which avoids
# triggering the heavy imports in utilities/__init__.


def apply_reachability_filter(
dataset: dict,
output_dir: str,
processing_level: str,
extra_entry_points: "set[str] | None" = None,
library_mode: bool = False,
) -> dict:
"""Filter dataset units to only those reachable from entry points.

Expand Down Expand Up @@ -254,6 +264,8 @@ def _load_module(name, filename):
_epd = _load_module("entry_point_detector", "entry_point_detector.py")
_ra = _load_module("reachability_analyzer", "reachability_analyzer.py")
EntryPointDetector = _epd.EntryPointDetector
blackout_warning = _epd.blackout_warning
library_seed_ids = _epd.library_seed_ids
ReachabilityAnalyzer = _ra.ReachabilityAnalyzer

call_graph_path = os.path.join(output_dir, "call_graph.json")
Expand All @@ -277,6 +289,11 @@ def _load_module(name, filename):
entry_points = detector.detect_entry_points()
if extra_entry_points:
entry_points = entry_points | extra_entry_points
# Library-mode (opt-in): the public API is the entry surface. Union-only —
# never demotes a structurally-detected app entry point, so an app scan with
# the flag on can only gain reachable units, never lose one.
if library_mode:
entry_points = entry_points | library_seed_ids(functions)

units = dataset.get("units", [])
original_count = len(units)
Expand Down Expand Up @@ -349,6 +366,12 @@ def _load_module(name, filename):
file=sys.stderr,
)

_blackout = blackout_warning(detector.entry_point_details, original_count,
len(filtered_units), library_mode=library_mode)
if _blackout:
dataset["metadata"]["reachability_filter"]["warning"] = _blackout
print(f" [Warning] {_blackout}", file=sys.stderr)

# Warn about unimplemented higher-level filters
if processing_level == "codeql":
print(
Expand All @@ -374,7 +397,7 @@ def _load_module(name, filename):
# Python parser
# ---------------------------------------------------------------------------

def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the Python parser.

The Python parser has a clean `parse_repository()` function that we can
Expand Down Expand Up @@ -402,7 +425,8 @@ def _parse_python(repo_path: str, output_dir: str, processing_level: str, skip_t

# Apply reachability filter if processing_level requires it
if processing_level != "all":
dataset = _apply_reachability_filter(dataset, output_dir, processing_level)
dataset = _apply_reachability_filter(dataset, output_dir, processing_level,
library_mode=library_mode)

# Write outputs
write_json(dataset_path, dataset)
Expand Down Expand Up @@ -523,7 +547,7 @@ def _file_lock(lock_path: Path):
f.close()


def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the JavaScript/TypeScript parser.

The JS parser is a PipelineTest class that runs Node.js subprocesses.
Expand All @@ -547,6 +571,8 @@ def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, sk
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down Expand Up @@ -582,7 +608,7 @@ def _parse_javascript(repo_path: str, output_dir: str, processing_level: str, sk
# Go parser
# ---------------------------------------------------------------------------

def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the Go parser.

The Go parser is a PipelineTest class that calls a compiled Go binary.
Expand All @@ -603,6 +629,8 @@ def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down Expand Up @@ -638,7 +666,7 @@ def _parse_go(repo_path: str, output_dir: str, processing_level: str, skip_tests
# C/C++ parser
# ---------------------------------------------------------------------------

def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the C/C++ parser.

The C parser uses tree-sitter for function extraction and call graph
Expand All @@ -661,6 +689,8 @@ def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests:
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down Expand Up @@ -697,7 +727,7 @@ def _parse_c(repo_path: str, output_dir: str, processing_level: str, skip_tests:
# Ruby parser
# ---------------------------------------------------------------------------

def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the Ruby parser.

The Ruby parser uses tree-sitter for function extraction and call graph
Expand All @@ -720,6 +750,8 @@ def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tes
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down Expand Up @@ -756,7 +788,7 @@ def _parse_ruby(repo_path: str, output_dir: str, processing_level: str, skip_tes
# PHP parser
# ---------------------------------------------------------------------------

def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the PHP parser.

The PHP parser uses tree-sitter for function extraction and call graph
Expand All @@ -779,6 +811,8 @@ def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_test
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down Expand Up @@ -815,7 +849,7 @@ def _parse_php(repo_path: str, output_dir: str, processing_level: str, skip_test
# Zig parser
# ---------------------------------------------------------------------------

def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None) -> ParseResult:
def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_tests: bool = True, name: str = None, library_mode: bool = False) -> ParseResult:
"""Invoke the Zig parser.

The Zig parser uses tree-sitter for function extraction and call graph
Expand All @@ -838,6 +872,8 @@ def _parse_zig(repo_path: str, output_dir: str, processing_level: str, skip_test
cmd.extend(["--name", name])
if skip_tests:
cmd.append("--skip-tests")
if library_mode:
cmd.append("--library-mode")

result = subprocess.run(
cmd,
Expand Down
2 changes: 2 additions & 0 deletions libs/openant-core/core/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def scan_repository(
diff_manifest: str | None = None,
llm_reachability: bool = False,
llm_reachability_max_code_bytes: int = 1500,
library_mode: bool = False,
) -> ScanResult:
"""Scan a repository for vulnerabilities.

Expand Down Expand Up @@ -171,6 +172,7 @@ def _step_label(name: str) -> str:
processing_level=effective_parse_level,
skip_tests=skip_tests,
diff_manifest=diff_manifest,
library_mode=library_mode,
)

ctx.summary = {
Expand Down
6 changes: 6 additions & 0 deletions libs/openant-core/openant/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,7 @@ def cmd_scan(args):
repo_url=getattr(args, "repo_url", None),
commit_sha=getattr(args, "commit_sha", None),
diff_manifest=getattr(args, "diff_manifest", None),
library_mode=getattr(args, "library_mode", False),
llm_reachability=getattr(args, "llm_reachability", False),
llm_reachability_max_code_bytes=getattr(
args, "llm_reachability_max_code_bytes", 1500
Expand Down Expand Up @@ -129,6 +130,7 @@ def cmd_parse(args):
name=getattr(args, "name", None),
diff_manifest=getattr(args, "diff_manifest", None),
fresh=getattr(args, "fresh", False),
library_mode=getattr(args, "library_mode", False),
)

ctx.summary = {
Expand Down Expand Up @@ -995,6 +997,8 @@ def main():
scan_p.add_argument("--dynamic-test", action="store_true",
help="Enable Docker-isolated dynamic testing (off by default)")
scan_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)")
scan_p.add_argument("--library-mode", action="store_true",
help="Seed the exported public API as entry points (for libraries with no main/route/CLI entry point)")
scan_p.add_argument("--limit", type=int, help="Max units to analyze")
scan_p.add_argument(
"--llm-config",
Expand Down Expand Up @@ -1058,6 +1062,8 @@ def main():
help="Processing level (default: reachable)",
)
parse_p.add_argument("--no-skip-tests", action="store_true", help="Include test files in parsing (default: tests are skipped)")
parse_p.add_argument("--library-mode", action="store_true",
help="Seed the exported public API as entry points (for libraries with no main/route/CLI entry point)")
parse_p.add_argument("--name", help="Dataset name (default: derived from repo path)")
parse_p.add_argument("--diff-manifest", help="Path to diff_manifest.json; tags units with diff_selected")
parse_p.add_argument("--fresh", action="store_true",
Expand Down
28 changes: 25 additions & 3 deletions libs/openant-core/parsers/c/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
from utilities.file_io import open_utf8, read_json, run_utf8, write_json
from utilities.context_enhancer import ContextEnhancer
from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer
from utilities.agentic_enhancer import EntryPointDetector, ReachabilityAnalyzer, blackout_warning, library_seed_ids

# Local imports
from repository_scanner import RepositoryScanner
Expand Down Expand Up @@ -87,7 +87,8 @@ def __init__(
processing_level: ProcessingLevel = ProcessingLevel.ALL,
skip_tests: bool = False,
depth: int = 3,
name: str = None
name: str = None,
library_mode: bool = False
):
self.repo_path = os.path.abspath(repo_path)
self.output_dir = output_dir or os.path.join(os.path.dirname(__file__), 'test_output')
Expand All @@ -98,6 +99,7 @@ def __init__(
self.skip_tests = skip_tests
self.depth = depth
self.dataset_name = name
self.library_mode = library_mode

# Pipeline artifacts
self.scan_results_file = None
Expand Down Expand Up @@ -292,6 +294,13 @@ def apply_reachability_filter(self) -> bool:
detector = EntryPointDetector(normalized_functions, call_graph)
self.entry_points = detector.detect_entry_points()

# Library-mode: a library's entry surface is its exported public API,
# which carries no main/route/CLI marker. Seed it so the BFS reaches
# the core instead of blacking out. Union-only — never drops a
# structurally-detected entry point.
if self.library_mode:
self.entry_points = self.entry_points | library_seed_ids(normalized_functions)

# Build reachability
reachability = ReachabilityAnalyzer(
functions=normalized_functions,
Expand Down Expand Up @@ -323,6 +332,13 @@ def apply_reachability_filter(self) -> bool:
"reduction_percentage": round((1 - len(filtered_units) / original_count) * 100, 1) if original_count > 0 else 0
}

_blackout = blackout_warning(detector.entry_point_details, original_count,
len(filtered_units),
library_mode=getattr(self, "library_mode", False))
if _blackout:
dataset["metadata"]["reachability_filter"]["warning"] = _blackout
print(f" [Warning] {_blackout}", file=sys.stderr)

write_json(self.dataset_file, dataset)

elapsed = (datetime.now() - start_time).total_seconds()
Expand Down Expand Up @@ -1027,6 +1043,11 @@ def main():
default=None,
help='Dataset name (default: derived from repo path)'
)
parser.add_argument(
'--library-mode',
action='store_true',
help='Seed the exported public API as entry points (for libraries with no main/route/CLI)'
)

args = parser.parse_args()

Expand All @@ -1048,7 +1069,8 @@ def main():
processing_level=processing_level,
skip_tests=args.skip_tests,
depth=args.depth,
name=args.name
name=args.name,
library_mode=args.library_mode
)
results = pipeline.run_full_pipeline()

Expand Down
Loading