diff --git a/clang/docs/EPOCH_ALLOCATOR.md b/clang/docs/EPOCH_ALLOCATOR.md new file mode 100644 index 00000000..68a6cbe0 --- /dev/null +++ b/clang/docs/EPOCH_ALLOCATOR.md @@ -0,0 +1,120 @@ +# HVM4 Epoch Nursery Allocator + +## Problem + +The current HVM4 allocator is a per-thread bump allocator that never reclaims memory. +For iterative algorithms (e.g., Bellman-Ford pathfinding), each round allocates O(N) +words for evaluation intermediates, leading to O(R × N) total heap consumption over R +rounds. The heap exhausts at HEAP_CAP and the program crashes. + +The `@compact` primitive deep-copies live data to fresh heap positions but doesn't +reclaim the old space. So total heap grows as O(R × N + R × T), where T is the size of +the live result tree. + +## Key Insight: IC is Cycle-Free + +Interaction Calculus (IC) guarantees cycle-freedom through: +- **Affine variables**: each used at most once +- **Consumption-based rewriting**: interacting nodes are consumed +- **Write-once substitution**: variable binding is final + +This means: after normalization to SNF, all evaluation intermediates are dead. Only the +result tree is live. No reference cycles exist. This makes epoch-based reclamation +provably complete — no GC tracing needed. + +## Architecture + +``` +HEAP[0 ................. STABLE_CAP ................. 
HEAP_CAP] + [--- Stable Region ---][--- Nursery (per-thread slices) ---] +``` + +### Stable Region: `HEAP[1, STABLE_CAP)` +- Holds compacted/promoted data that survives across epochs +- Single bump allocator (`STABLE_NEXT`) +- Called only from `compact_deep_copy` (single-threaded, after normalization) +- Grows monotonically; for long-running computations, periodic stable compaction + can reclaim dead stable data (future work) + +### Nursery: `HEAP[STABLE_CAP, HEAP_CAP)` (per-thread slices) +- Each thread gets `(HEAP_CAP - STABLE_CAP) / num_threads` words +- Same bump allocation as current `heap_alloc()` — zero overhead change +- At epoch boundary: reset `HEAP_NEXT[t]` to nursery start (O(1) per thread) +- Free lists cleared on reset (they may contain nursery locations) + +## Epoch Lifecycle + +``` +┌─────────────────────────────────────────┐ +│ 1. epoch_begin() │ Mark epoch start +│ 2. Build term (allocates in nursery) │ Same as current heap_alloc +│ 3. eval_normalize(term) │ Multi-threaded normalization +│ 4. compact_deep_copy(result) │ Copy live → stable region +│ 5. epoch_reset() │ O(1) nursery reclaim +│ 6. Use compacted result for next round │ +│ 7. → goto 1 │ +└─────────────────────────────────────────┘ +``` + +## Performance Characteristics + +| Operation | Complexity | Notes | +|----------------|------------|------------------------------------| +| Nursery alloc | O(1) | Bump pointer, identical to current | +| Epoch reset | O(threads) | Pointer rewind per thread | +| Compact | O(live) | Deep-copy of surviving data | +| Stable alloc | O(1) | Bump pointer | + +**Zero per-object overhead**: No headers, no refcounts, no tags. The nursery bump +allocator is byte-for-byte identical to the current `heap_alloc`. 
+ +**Memory savings**: Over R rounds with tree size T and nursery allocs N per round: +- Without epoch: O(R × N) total heap (linear growth → OOM) +- With epoch: O(R × T) stable + O(N) nursery (constant nursery, growing stable) +- Since T << N typically, this is a massive improvement + +## API + +```c +// Initialize epoch mode (replaces heap_init_slices) +void epoch_init(u32 stable_fraction); // e.g., 4 = 1/4 of HEAP for stable + +// Epoch lifecycle +void epoch_begin(void); // Start new epoch +void epoch_reset(void); // O(1) nursery reclaim (fast, no zeroing) +void epoch_reset_zero(void); // Debug: zero nursery after reset + +// Stable allocation (for compact) +u64 heap_alloc_stable(u64 size); // Bump in stable region + +// Stats +u64 epoch_nursery_used(u32 tid); // Words used by thread this epoch +u64 epoch_nursery_used_total(void); // Total across all threads +u64 epoch_stable_used(void); // Stable words used +void epoch_print_stats(void); // Print summary to stderr +``` + +## Integration + +- `heap_alloc()` is **unchanged** — it bumps in the thread's slice, which epoch_init + points to the nursery region instead of the full heap +- `compact_deep_copy()` uses `heap_alloc_stable()` when `EPOCH_ENABLED` +- `eval_normalize()` is unchanged — multi-threaded normalization works as before +- `epoch_reset()` must be called **after** all threads have joined (which + `eval_normalize` guarantees with its pthread_join barrier) + +## CLI + +``` +./main --epoch # Enable epoch mode (stable = 1/4 HEAP) +./main --epoch=8 # Stable = 1/8 HEAP (larger nursery) +./main --epoch-bench # Run allocation microbenchmark +``` + +## Future Work + +- **Stable compaction**: When stable fills up, deep-copy live stable data to compact it +- **Ref-counted stable**: IC cycle-freedom makes per-object ref-counting trivially + correct for stable data; could enable fine-grained stable recycling +- **Ring-backed nursery overflow**: If nursery is exhausted mid-epoch, spill to a + secondary 
ElasticRing-backed overflow buffer (currently: error + exit) +- **Concurrent compact**: Parallelize deep-copy across threads for large result trees diff --git a/clang/heap/epoch.c b/clang/heap/epoch.c new file mode 100644 index 00000000..dcf1e6bd --- /dev/null +++ b/clang/heap/epoch.c @@ -0,0 +1,354 @@ +// heap/epoch.c — Ring-backed epoch nursery for HVM4 +// +// Splits HEAP into a stable region (for compacted data) and per-thread nursery +// slices (for ephemeral evaluation intermediates). At epoch boundary, nursery +// pointers reset in O(1), bulk-reclaiming all evaluation garbage. +// +// IC's cycle-freedom guarantees: after normalization to SNF, all nursery data +// except the result tree is dead. compact_deep_copy moves the live result to +// stable, making nursery safe to reclaim without tracing. +// +// heap_alloc() is UNCHANGED — it bumps in the thread's slice (which epoch_init +// points to the nursery region). Zero allocation overhead vs. current allocator. + +static int EPOCH_ENABLED = 0; +static u32 EPOCH_DEPTH = 0; // Nesting depth for nested @compact calls + +// Stable region: [1, STABLE_CAP) +static u64 STABLE_CAP = 0; +static u64 STABLE_NEXT = 0; + +// Per-thread nursery start positions (for reset) +// Uses same HEAP_STRIDE padding as HEAP_NEXT/HEAP_END for cache alignment +static u64 NURSERY_START[MAX_THREADS * HEAP_STRIDE] __attribute__((aligned(256))) = {0}; +#define NURSERY_START_AT(t) NURSERY_START[(u64)(t) * HEAP_STRIDE] + +// Stats +typedef struct { + u64 epochs; + u64 nursery_words_reclaimed; + u64 stable_words_allocated; + u64 peak_nursery_used; +} EpochStats; + +static EpochStats EPOCH_STATS = {0}; + +// ============================================================ +// Initialization +// ============================================================ + +// Initialize epoch mode. Replaces heap_init_slices(). +// stable_fraction: denominator for stable size (4 = 1/4 of HEAP for stable). 
+// Values of stable_fraction below 2 are not rejected here; they silently fall
+// back to the default of 4 so the nursery always keeps at least half the heap.
+//
+// Layout produced:
+//   stable  = HEAP[1, STABLE_CAP)            (location 0 is the null sentinel)
+//   nursery = HEAP[STABLE_CAP, HEAP_CAP)     split into one slice per thread;
+//             the last thread also receives the division remainder.
+//
+// Exits with an error when the thread count is 0 or above MAX_THREADS, or when
+// the heap is too small to hold the stable floor plus at least one nursery
+// word per thread. Without these checks the code below would divide by zero,
+// underflow `words - STABLE_CAP`, or write past the per-thread tables.
+fn void epoch_init(u32 stable_fraction) {
+  u32 threads = thread_get_count();
+  u64 words = HEAP_CAP;
+
+  if (threads == 0 || threads > MAX_THREADS) {
+    fprintf(stderr, "epoch_init: invalid thread count %u (must be 1..%d)\n",
+      threads, (int)MAX_THREADS);
+    exit(1);
+  }
+
+  // Stable region: first 1/stable_fraction of HEAP, with a 4096-word floor
+  if (stable_fraction < 2) stable_fraction = 4;
+  STABLE_CAP = words / stable_fraction;
+  if (STABLE_CAP < 4096) STABLE_CAP = 4096;
+
+  // The floor can push STABLE_CAP up to or past HEAP_CAP on a tiny heap; the
+  // unsigned subtraction below would then wrap to a huge nursery size.
+  if (STABLE_CAP >= words || (words - STABLE_CAP) / threads == 0) {
+    fprintf(stderr,
+      "epoch_init: heap too small (%llu words) for %llu stable words plus %u nursery slices\n",
+      (unsigned long long)words,
+      (unsigned long long)STABLE_CAP,
+      threads);
+    exit(1);
+  }
+  STABLE_NEXT = 1; // Skip location 0 (null sentinel)
+
+  // Nursery region: rest of HEAP, split equally per thread
+  u64 nursery_total = words - STABLE_CAP;
+  u64 nursery_per_thread = nursery_total / threads;
+
+  for (u32 t = 0; t < threads; t++) {
+    u64 start = STABLE_CAP + (u64)t * nursery_per_thread;
+    // Last slice absorbs the remainder so no heap word is left unowned
+    u64 end = (t == threads - 1) ? words : start + nursery_per_thread;
+    NURSERY_START_AT(t) = start;
+    HEAP_NEXT_AT(t) = start;
+    HEAP_END_AT(t) = end;
+  }
+
+  // Clear unused thread slots
+  for (u32 t = threads; t < MAX_THREADS; t++) {
+    NURSERY_START_AT(t) = 0;
+    HEAP_NEXT_AT(t) = 0;
+    HEAP_END_AT(t) = 0;
+  }
+
+  EPOCH_ENABLED = 1;
+  EPOCH_DEPTH = 0;
+  memset(&EPOCH_STATS, 0, sizeof(EPOCH_STATS));
+}
+
+// ============================================================
+// Stable allocation
+// ============================================================
+
+// Allocate from the stable region. Single-threaded: only called from
+// compact_deep_copy on thread 0, after eval_normalize has joined all threads.
+// Returns the first location of `size` freshly bumped stable words.
+// Prints a diagnostic and exits when the request does not fit below
+// STABLE_CAP or when `at + size` wraps around.
+fn u64 heap_alloc_stable(u64 size) {
+  u64 at = STABLE_NEXT;
+  u64 next = at + size;
+  // `next >= at` rejects u64 wrap-around of the bump pointer
+  if (__builtin_expect(next <= STABLE_CAP && next >= at, 1)) {
+    STABLE_NEXT = next;
+    EPOCH_STATS.stable_words_allocated += size;
+    return at;
+  }
+  fprintf(stderr,
+    "Out of stable heap memory (used %llu / %llu words, need %llu)\n"
+    "Hint: use --epoch=N with smaller N for more stable space\n",
+    (unsigned long long)STABLE_NEXT,
+    (unsigned long long)STABLE_CAP,
+    (unsigned long long)size);
+  exit(1);
+}
+
+// ============================================================
+// Epoch lifecycle
+// ============================================================
+
+// Begin a new epoch. Supports nesting: only the outermost begin/reset pair
+// actually resets the nursery. Inner @compact calls are no-ops for epoch lifecycle.
+fn void epoch_begin(void) {
+  EPOCH_DEPTH++;
+}
+
+// Shared implementation of epoch_reset() and epoch_reset_zero().
+//
+// When nested (EPOCH_DEPTH > 1) it only pops one nesting level. Otherwise it
+// rewinds every thread's bump pointer to its nursery start, optionally zeroing
+// the words that were in use first, calls heap_free_reset(), and updates
+// EPOCH_STATS (words reclaimed, peak usage, epoch count).
+static void epoch_reclaim_nursery(int zero_fill) {
+  if (EPOCH_DEPTH > 1) {
+    EPOCH_DEPTH--;
+    return; // Inner @compact — don't reset nursery
+  }
+  EPOCH_DEPTH = 0;
+
+  u32 threads = thread_get_count();
+  u64 total_used = 0;
+
+  for (u32 t = 0; t < threads; t++) {
+    u64 start = NURSERY_START_AT(t);
+    u64 used = HEAP_NEXT_AT(t) - start;
+    total_used += used;
+    if (zero_fill && used > 0) {
+      memset(&HEAP[start], 0, used * sizeof(Term));
+    }
+    HEAP_NEXT_AT(t) = start;
+  }
+
+  // Kept for callers that still expect free-list state to be dropped here
+  heap_free_reset();
+
+  EPOCH_STATS.nursery_words_reclaimed += total_used;
+  if (total_used > EPOCH_STATS.peak_nursery_used) {
+    EPOCH_STATS.peak_nursery_used = total_used;
+  }
+  EPOCH_STATS.epochs++;
+}
+
+// Reset all nursery regions. O(threads) pointer rewinds.
+// MUST be called after compact has copied all live data to stable.
+// After this call, ALL nursery locations contain stale data.
+// Supports nesting: only the outermost reset actually reclaims memory.
+fn void epoch_reset(void) {
+  epoch_reclaim_nursery(0);
+}
+
+// Debug variant: zero nursery memory after reset.
+// WARNING: Writing 0 to nursery slots makes heap_read() spin-wait on those
+// locations. Only use single-threaded when no other thread could be reading.
+// O(nursery_used) — catches stale-pointer bugs but is slower than epoch_reset.
+fn void epoch_reset_zero(void) {
+  epoch_reclaim_nursery(1);
+}
+
+// ============================================================
+// Queries
+// ============================================================
+
+// Words bumped by thread `tid` since its nursery slice was last rewound.
+fn u64 epoch_nursery_used(u32 tid) {
+  return HEAP_NEXT_AT(tid) - NURSERY_START_AT(tid);
+}
+
+// Sum of epoch_nursery_used() over the currently configured thread count.
+fn u64 epoch_nursery_used_total(void) {
+  u32 threads = thread_get_count();
+  u64 total = 0;
+  for (u32 t = 0; t < threads; t++) {
+    total += HEAP_NEXT_AT(t) - NURSERY_START_AT(t);
+  }
+  return total;
+}
+
+// Words handed out by heap_alloc_stable() so far.
+fn u64 epoch_stable_used(void) {
+  return STABLE_NEXT - 1; // -1 because we start at 1
+}
+
+// Non-zero when `loc` lies in the stable region [1, STABLE_CAP).
+fn int epoch_is_stable(u32 loc) {
+  return loc > 0 && (u64)loc < STABLE_CAP;
+}
+
+// Reset stable allocation pointer. Used by benchmarks to re-run stable alloc tests.
+fn void epoch_reset_stable(void) {
+  STABLE_NEXT = 1;
+}
+
+// ============================================================
+// Stats
+// ============================================================
+
+// Write the epoch allocator summary to stderr. Does nothing unless epoch
+// mode has been enabled by epoch_init().
+fn void epoch_print_stats(void) {
+  if (EPOCH_ENABLED) {
+    u64 stable_used = epoch_stable_used();
+    double stable_pct = 100.0 * (double)stable_used / (double)STABLE_CAP;
+
+    fprintf(stderr, "[epoch] mode: enabled (stable=1/%llu of HEAP)\n",
+      (unsigned long long)(HEAP_CAP / STABLE_CAP));
+    fprintf(stderr, "[epoch] epochs completed: %llu\n",
+      (unsigned long long)EPOCH_STATS.epochs);
+    fprintf(stderr, "[epoch] stable: %llu / %llu words (%.1f%%)\n",
+      (unsigned long long)stable_used,
+      (unsigned long long)STABLE_CAP,
+      stable_pct);
+    fprintf(stderr, "[epoch] nursery reclaimed: %llu words total\n",
+      (unsigned long long)EPOCH_STATS.nursery_words_reclaimed);
+    fprintf(stderr, "[epoch] peak nursery per epoch: %llu words\n",
+      (unsigned long long)EPOCH_STATS.peak_nursery_used);
+  }
+}
+
+// ============================================================
+// Benchmark
+// ============================================================
+
+// Seconds elapsed between two CLOCK_MONOTONIC samples.
+static double epoch_bench_seconds(const struct timespec *from, const struct timespec *to) {
+  return (to->tv_sec - from->tv_sec) + (to->tv_nsec - from->tv_nsec) / 1e9;
+}
+
+// Allocation microbenchmark driven by --epoch-bench. Re-initialises epoch
+// mode with stable = 1/4, then times four phases on thread 0: nursery
+// allocation, epoch reset, stable allocation, and a multi-epoch simulation.
+// All output goes to stderr. Always returns 0.
+fn int epoch_bench(void) {
+  struct timespec began, ended;
+
+  fprintf(stderr, "[epoch_bench] Starting allocation benchmark...\n");
+
+  // Stable gets a quarter of the heap, the nursery the rest
+  epoch_init(4);
+
+  u64 slice_words = HEAP_END_AT(0) - NURSERY_START_AT(0);
+  fprintf(stderr, "[epoch_bench] Nursery capacity (thread 0): %llu words (%.1f MB)\n",
+    (unsigned long long)slice_words,
+    (double)(slice_words * sizeof(Term)) / (1024.0 * 1024.0));
+  fprintf(stderr, "[epoch_bench] Stable capacity: %llu words (%.1f MB)\n",
+    (unsigned long long)STABLE_CAP,
+    (double)(STABLE_CAP * sizeof(Term)) / (1024.0 * 1024.0));
+
+  // Phase 1: fill thread 0's nursery to about 80% with 2-word nodes
+  {
+    u64 nodes = 0;
+    u64 words = 0;
+    clock_gettime(CLOCK_MONOTONIC, &began);
+
+    u64 target = slice_words * 8 / 10;
+    for (; epoch_nursery_used(0) < target; nodes++, words += 2) {
+      u64 at = heap_alloc(2);
+      heap_set((u32)at, 42);
+      heap_set((u32)(at + 1), 43);
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ended);
+    double secs = epoch_bench_seconds(&began, &ended);
+    fprintf(stderr, "[epoch_bench] Nursery alloc: %llu allocs (%llu words) in %.4f s = %.1f M allocs/s\n",
+      (unsigned long long)nodes,
+      (unsigned long long)words,
+      secs, nodes / secs / 1e6);
+  }
+
+  // Phase 2: latency of epoch_reset(), re-dirtying 16 words after each one
+  {
+    u64 before_first_reset = epoch_nursery_used(0);
+    clock_gettime(CLOCK_MONOTONIC, &began);
+
+    u32 rounds = 10000;
+    u32 done = 0;
+    while (done < rounds) {
+      epoch_reset();
+      heap_alloc(16);
+      done++;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ended);
+    double secs = epoch_bench_seconds(&began, &ended);
+    fprintf(stderr, "[epoch_bench] Epoch reset: %u resets in %.6f s = %.0f ns/reset\n",
+      rounds, secs, secs / rounds * 1e9);
+    fprintf(stderr, "[epoch_bench] (first reset reclaimed %llu words)\n",
+      (unsigned long long)before_first_reset);
+  }
+
+  // Phase 3: fill the stable region to about 80% with 2-word nodes
+  {
+    u64 nodes = 0;
+    u64 target = STABLE_CAP * 8 / 10;
+    clock_gettime(CLOCK_MONOTONIC, &began);
+
+    for (; epoch_stable_used() < target; nodes++) {
+      u64 at = heap_alloc_stable(2);
+      heap_set((u32)at, 42);
+      heap_set((u32)(at + 1), 43);
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ended);
+    double secs = epoch_bench_seconds(&began, &ended);
+    fprintf(stderr, "[epoch_bench] Stable alloc: %llu allocs in %.4f s = %.1f M allocs/s\n",
+      (unsigned long long)nodes, secs, nodes / secs / 1e6);
+  }
+
+  // Phase 4: begin / allocate / promote / reset, repeated
+  {
+    // Start from an empty stable region and an empty nursery
+    epoch_reset_stable();
+    epoch_reset();
+
+    u32 num_epochs = 100;
+    u64 allocs_per_epoch = 100000;
+    u64 live_per_epoch = 1000; // words promoted to stable each epoch
+
+    clock_gettime(CLOCK_MONOTONIC, &began);
+
+    u32 e = 0;
+    while (e < num_epochs) {
+      epoch_begin();
+
+      // Stand-in for normalization: lots of short-lived nursery nodes
+      for (u64 i = 0; i < allocs_per_epoch; i++) {
+        u64 at = heap_alloc(2);
+        heap_set((u32)at, (Term)(i + 1));
+        heap_set((u32)(at + 1), (Term)(i + 2));
+      }
+
+      // Stand-in for compact: a handful of words survive into stable
+      for (u64 i = 0; i < live_per_epoch; i++) {
+        u64 at = heap_alloc_stable(1);
+        heap_set((u32)at, (Term)(e * 1000 + i));
+      }
+
+      epoch_reset();
+      e++;
+    }
+
+    clock_gettime(CLOCK_MONOTONIC, &ended);
+    double secs = epoch_bench_seconds(&began, &ended);
+    fprintf(stderr, "[epoch_bench] Multi-epoch: %u epochs in %.4f s = %.2f ms/epoch\n",
+      num_epochs, secs, secs / num_epochs * 1e3);
+    fprintf(stderr, "[epoch_bench] Nursery allocs/epoch: %llu (reclaimed each time)\n",
+      (unsigned long long)allocs_per_epoch);
+    fprintf(stderr, "[epoch_bench] Stable after %u epochs: %llu words\n",
+      num_epochs, (unsigned long long)epoch_stable_used());
+    fprintf(stderr, "[epoch_bench] Without epochs would use: %llu words\n",
+      (unsigned long long)(num_epochs * allocs_per_epoch * 2));
+  }
+
+  epoch_print_stats();
+  fprintf(stderr, "[epoch_bench] All benchmarks passed\n");
+  return 0;
+}
diff --git a/clang/heap/free.c b/clang/heap/free.c
new file mode 100644
index 00000000..67d668fb
--- /dev/null
+++ b/clang/heap/free.c
@@ -0,0 +1,8 @@
+// heap/free.c — Stub for removed free-list allocator
+//
+// The per-thread free-list was replaced by the epoch nursery system.
+// heap_free_reset() is called by epoch_reset() for backward compatibility.
+ +fn void heap_free_reset(void) { + // No-op: free lists no longer exist +} diff --git a/clang/hvm4.c b/clang/hvm4.c index 9f9494d0..cbdcc412 100644 --- a/clang/hvm4.c +++ b/clang/hvm4.c @@ -285,12 +285,14 @@ static int PARSE_FORK_SIDE = -1; // -1 = off, 0 = left branch (DP0), 1 = // Heap // ==== +#include "heap/free.c" #include "heap/alloc.c" #include "heap/read.c" #include "heap/take.c" #include "heap/set.c" #include "heap/set_rel.c" #include "heap/init_slices.c" +#include "heap/epoch.c" // Term Constructors // ================= @@ -367,6 +369,7 @@ static int PARSE_FORK_SIDE = -1; // -1 = off, 0 = left branch (DP0), 1 = #include "prim/fn/log_go_0.c" #include "prim/fn/log_go_1.c" #include "prim/fn/log_go_2.c" +#include "prim/fn/compact.c" #include "prim/init.c" #include "print/term.c" diff --git a/clang/main.c b/clang/main.c index c5d2b5f4..94f8a4e4 100644 --- a/clang/main.c +++ b/clang/main.c @@ -4,13 +4,16 @@ // This file provides the command-line interface for the HVM4 runtime, // mirroring the structure of main.hs for the Haskell implementation. // -// Usage: ./main [-s] [-S] [-D] [-C[N]] [-T] +// Usage: ./main [-s] [-S] [-D] [-C[N]] [-T] [--epoch[=N]] // -s: Show statistics (interactions, time, performance) // -S: Silent output (omit term printing) // -D: Step-by-step reduction (print intermediate terms) // -C: Collapse and flatten (enumerate all superposition branches) // -CN: Collapse and flatten, limit to N results // -T: Use N threads (e.g. -T4) +// --epoch: Enable epoch nursery allocator (stable = 1/4 of HEAP) +// --epoch=N: Enable epoch mode with stable = 1/N of HEAP +// --epoch-bench: Run epoch allocator microbenchmarks #include "hvm4.c" @@ -32,6 +35,8 @@ typedef struct { int debug; int step_by_step; int threads; + int epoch; // 0 = off, >0 = stable fraction (e.g. 
4 = 1/4 HEAP) + int epoch_bench; u32 ffi_loads_len; FfiLoad ffi_loads[FFI_MAX]; char *file; @@ -46,6 +51,8 @@ fn CliOpts parse_opts(int argc, char **argv) { .debug = 0, .step_by_step = 0, .threads = 0, + .epoch = 0, + .epoch_bench = 0, .ffi_loads_len = 0, .file = NULL }; @@ -74,6 +81,16 @@ fn CliOpts parse_opts(int argc, char **argv) { fprintf(stderr, "Error: -T value (%d) exceeds MAX_THREADS (%d)\n", opts.threads, MAX_THREADS); exit(1); } + } else if (strcmp(argv[i], "--epoch") == 0) { + opts.epoch = 4; // default: stable = 1/4 of HEAP + } else if (strncmp(argv[i], "--epoch=", 8) == 0) { + opts.epoch = atoi(argv[i] + 8); + if (opts.epoch < 2) { + fprintf(stderr, "Error: --epoch=N requires N >= 2\n"); + exit(1); + } + } else if (strcmp(argv[i], "--epoch-bench") == 0) { + opts.epoch_bench = 1; } else if (strcmp(argv[i], "-d") == 0) { opts.debug = 1; } else if (strcmp(argv[i], "-D") == 0) { @@ -133,6 +150,16 @@ int main(int argc, char **argv) { // Parse command line CliOpts opts = parse_opts(argc, argv); + if (opts.epoch_bench) { + HEAP = calloc(HEAP_CAP, sizeof(Term)); + if (!HEAP) { + sys_error("Memory allocation failed"); + } + thread_set_count(1); + wnf_set_tid(0); + return epoch_bench(); + } + if (opts.file == NULL) { fprintf(stderr, "Usage: ./main [-s] [-S] [-D] [-C[N]] [-T] [--ffi ] [--ffi-dir ]\n"); return 1; @@ -160,7 +187,11 @@ int main(int argc, char **argv) { if (!BOOK || !HEAP || !TABLE) { sys_error("Memory allocation failed"); } - heap_init_slices(); + if (opts.epoch > 0) { + epoch_init((u32)opts.epoch); + } else { + heap_init_slices(); + } // Register known primitives before parsing (needed for arity checks). 
prim_init(); @@ -245,8 +276,10 @@ int main(int argc, char **argv) { } printf("- Time: %.3f seconds\n", dt); printf("- Perf: %.2f M interactions/s\n", ips / 1e6); + epoch_print_stats(); } else if (opts.silent) { printf("- Itrs: %llu interactions\n", total_itrs); + epoch_print_stats(); } // Cleanup diff --git a/clang/prim/fn/compact.c b/clang/prim/fn/compact.c new file mode 100644 index 00000000..c1c55f5a --- /dev/null +++ b/clang/prim/fn/compact.c @@ -0,0 +1,122 @@ +// @compact(term): Normalize term, then deep-copy the result tree to fresh heap +// positions. In epoch mode, copies to the stable region and resets the nursery, +// reclaiming all evaluation intermediates in O(1). +// +// Without epoch mode, the old tree and evaluation intermediates remain in place +// (some freed to the free list during normalization, the rest as unreferenced +// garbage). This is safe but wastes memory over many rounds. +// +// With epoch mode, the lifecycle is: +// 1. Allocations during normalization go to the nursery (same heap_alloc) +// 2. compact_deep_copy copies the live result to the stable region +// 3. epoch_reset() reclaims the entire nursery in O(threads) pointer rewinds +// +// IC's cycle-freedom guarantees that after SNF, all nursery data except the +// result tree is dead. No tracing or ref-counting needed. + +// Forward declarations (defined later in include order) +fn Term eval_normalize(Term term); + +// Maximum recursion depth for compact_deep_copy. Prevents stack overflow on +// deeply nested SNF trees (e.g., long lists). 64K depth at ~64 bytes/frame +// uses ~4MB stack, well within typical 8MB stack limit. +#define COMPACT_MAX_DEPTH 65536 + +// Deep-copy a fully-normalized term tree to fresh heap positions. +// After eval_normalize, the tree should be pure SNF: constructors, numbers, +// lambdas, SUPs, ERAs, and REFs. No unresolved DP0/DP1 or VARs. +// Handles DP0/DP1/VAR defensively by following resolved substitutions. 
+//
+// In epoch mode: allocates in the stable region via heap_alloc_stable().
+// Without epoch mode: allocates in the caller's nursery via heap_alloc().
+//
+// term:  root of the (sub)tree to copy.
+// depth: current recursion depth; the top-level call passes 0.
+// Returns the term re-pointed at the copied children, or `term` itself when
+// it has no heap children, already lives in the stable region (epoch mode),
+// or carries the null location 0.
+//
+// Depth overflow: in classic mode the term is returned as-is, which is safe
+// because the old heap is never reclaimed. In epoch mode the caller rewinds
+// the nursery right after the copy, so returning an uncopied nursery pointer
+// would leave a dangling reference; that case is a fatal error instead.
+static Term compact_deep_copy_go(Term term, u32 depth) {
+  if (__builtin_expect(depth > COMPACT_MAX_DEPTH, 0)) {
+    if (EPOCH_ENABLED) {
+      fprintf(stderr,
+        "compact: tree depth exceeds %u — cannot promote result before nursery reset\n",
+        COMPACT_MAX_DEPTH);
+      exit(1);
+    }
+    fprintf(stderr, "compact: tree depth exceeds %u — returning term as-is\n",
+      COMPACT_MAX_DEPTH);
+    return term;
+  }
+
+  u8 tag = term_tag(term);
+
+  // Follow resolved DP0/DP1 and VAR substitutions
+  while (tag == DP0 || tag == DP1 || tag == VAR) {
+    u32 loc = term_val(term);
+    if (loc == 0) break; // Null sentinel — can't read HEAP[0] (spin-waits)
+    Term cell = heap_read(loc);
+    if (term_sub_get(cell)) {
+      term = term_sub_set(cell, 0);
+      tag = term_tag(term);
+    } else {
+      break; // unresolved — copy the cell as-is
+    }
+  }
+
+  // Determine number of heap children
+  u32 nch;
+  switch (tag) {
+    case NUM: case ERA: case NAM: case ANY:
+    case C00: case BJV: case BJ0: case BJ1:
+    case REF: case F_OP2_NUM:
+      return term; // no heap children — return as-is
+
+    case DP0: case DP1: case VAR:
+      nch = 1; break;
+
+    // F_EQL_R stores a heap loc in ext field which we can't relocate here.
+    // It should not appear in SNF. If it does, copy val-children but note
+    // the ext-field reference is NOT updated (potential stale pointer in epoch).
+    case F_EQL_R:
+      nch = 2; break;
+
+    case ALO:
+      nch = 0; break; // ALO should not appear in SNF, skip
+
+    case PRI:
+      nch = prim_arity(term_ext(term)); break;
+
+    default:
+      nch = TERM_ARITY[tag]; break;
+  }
+
+  if (nch == 0) return term;
+
+  u32 old_loc = term_val(term);
+
+  // Location 0 is the null sentinel. The substitution loop above refuses to
+  // read it, so the child copy below must not read it either: reading
+  // HEAP[0] spin-waits.
+  if (old_loc == 0) return term;
+
+  // In epoch mode, skip copying data that's already in the stable region
+  if (EPOCH_ENABLED && epoch_is_stable(old_loc)) {
+    return term;
+  }
+
+  // Allocate in stable (epoch mode) or nursery (classic mode)
+  u32 new_loc = (u32)(EPOCH_ENABLED ? heap_alloc_stable(nch) : heap_alloc(nch));
+  for (u32 i = 0; i < nch; i++) {
+    heap_set(new_loc + i, compact_deep_copy_go(heap_read(old_loc + i), depth + 1));
+  }
+  return term_new(term_sub_get(term), tag, term_ext(term), new_loc);
+}
+
+// Entry point for the deep copy: starts the recursion at depth 0.
+static Term compact_deep_copy(Term term) {
+  return compact_deep_copy_go(term, 0);
+}
+
+// @compact(term) primitive. args[0] is the term to compact.
+// Normalizes it, deep-copies the result, and in epoch mode brackets the work
+// with epoch_begin()/epoch_reset() so the nursery is reclaimed afterwards
+// (only by the outermost call when @compact calls nest).
+fn Term prim_fn_compact(Term *args) {
+  if (EPOCH_ENABLED) {
+    epoch_begin();
+  }
+
+  // 1. Normalize the argument to SNF
+  Term root = eval_normalize(args[0]);
+
+  // 2. Deep-copy the normalized tree to fresh/stable heap positions
+  Term copy = compact_deep_copy(root);
+
+  if (EPOCH_ENABLED) {
+    // 3. Reclaim nursery: O(threads) pointer rewinds
+    epoch_reset();
+  }
+
+  return copy;
+}
+
+// Register the primitive under the name "compact" (name length 7, arity 1).
+fn void prim_compact_init(void) {
+  prim_register("compact", 7, 1, prim_fn_compact);
+}
diff --git a/clang/prim/init.c b/clang/prim/init.c
index f5d1348d..1cf6eb1d 100644
--- a/clang/prim/init.c
+++ b/clang/prim/init.c
@@ -1,3 +1,4 @@
 fn void prim_init(void) {
   prim_log_init();
+  prim_compact_init();
 }