From 9afa0bf696ca15b2bca30f4107dc416ee06287cb Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 11:34:36 +0200
Subject: [PATCH 1/6] Add @compact primitive and per-size free-list memory
 recycling

Introduces two features that enable iterative HVM4 programs (like
Bellman-Ford with early termination) to run without exhausting the heap:

1. @compact(term): Normalizes term to SNF, then deep-copies the result
   tree to fresh heap positions. Safe to call from any evaluation context
   (inside WNF, nested in expressions) because it never modifies existing
   heap data. Enables iterative algorithms to shed accumulated evaluation
   intermediates between rounds.

2. Per-thread, size-segregated free lists (sizes 1-16): Freed heap blocks
   are recycled via LIFO free lists, checked first by heap_alloc before
   bump-allocating. heap_free calls added to all interaction handlers
   (APP-LAM, DUP-NOD, DUP-SUP, MAT-CTR, MAT-NUM, OP2, EQL, etc.).
   Disabled in multi-threaded mode to avoid cross-thread races.

Also includes OP2-NUM fast path (skip frame push when both operands are
already NUM) and dup_nod split allocation (3 separate allocs instead of
one block, enabling free-list reuse of smaller sizes).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/heap/alloc.c      |  9 +++++
 clang/heap/free.c       | 25 ++++++++++++
 clang/hvm4.c            |  4 ++
 clang/prim/fn/compact.c | 87 +++++++++++++++++++++++++++++++++++++++++
 clang/prim/init.c       |  1 +
 clang/wnf/_.c           | 45 +++++++++++++++++++--
 clang/wnf/app_mat_ctr.c |  3 ++
 clang/wnf/app_mat_num.c |  4 +-
 clang/wnf/app_mat_sup.c |  1 +
 clang/wnf/dup_dry.c     |  1 +
 clang/wnf/dup_lam.c     |  1 +
 clang/wnf/dup_nod.c     | 12 ++++--
 clang/wnf/dup_sup.c     |  2 +
 clang/wnf/op2_sup.c     |  7 +++-
 14 files changed, 192 insertions(+), 10 deletions(-)
 create mode 100644 clang/heap/free.c
 create mode 100644 clang/prim/fn/compact.c

diff --git a/clang/heap/alloc.c b/clang/heap/alloc.c
index 488eb3e8..ab132a70 100644
--- a/clang/heap/alloc.c
+++ b/clang/heap/alloc.c
@@ -1,5 +1,14 @@
 fn u64 heap_alloc(u64 size) {
   u32 tid  = WNF_TID;
+  // Free list first (sizes 1..FREE_LIST_MAX_SIZE); size 0 would index slot -1.
+  if (__builtin_expect(size != 0 && size <= FREE_LIST_MAX_SIZE, 1)) {
+    u64 head = FREE_HEAD_AT(tid, size);  // 0 means this size class is empty
+    if (__builtin_expect(head != 0, 0)) {
+      FREE_HEAD_AT(tid, size) = HEAP[head];  // pop: freed block stores next
+      return head;
+    }
+  }
+  // Bump allocate
   u64 idx  = (u64)tid * HEAP_STRIDE;
   u64 at   = HEAP_NEXT[idx];
   u64 next = at + size;
diff --git a/clang/heap/free.c b/clang/heap/free.c
new file mode 100644
index 00000000..60bd0445
--- /dev/null
+++ b/clang/heap/free.c
@@ -0,0 +1,25 @@
+// Per-thread, size-segregated free lists for heap memory recycling.
+// Each freed block stores a "next" pointer in HEAP[loc].
+// Free list heads are per-thread to avoid contention.
+
+#define FREE_LIST_MAX_SIZE 16  // largest block size (in words) kept on a free list
+#define FREE_STRIDE 16  // 16 u64s = 128 bytes per thread (cache-line aligned); must be >= FREE_LIST_MAX_SIZE
+
+static u64 FREE_HEADS[MAX_THREADS * FREE_STRIDE] __attribute__((aligned(128))) = {0};
+
+#define FREE_HEAD_AT(tid, sz) FREE_HEADS[(u64)(tid) * FREE_STRIDE + (sz) - 1]
+
+fn void heap_free(u64 loc, u64 size) {
+  // Disabled in multi-threaded mode: cross-thread DUP interactions can free
+  // blocks from another thread's heap slice, causing race conditions.
+  // In single-threaded mode, the free list is safe and improves memory reuse.
+  if (__builtin_expect(THREAD_COUNT > 1, 0)) return;
+  if (__builtin_expect(size == 0 || size > FREE_LIST_MAX_SIZE, 0)) return;
+  u32 tid = WNF_TID;
+  HEAP[loc] = FREE_HEAD_AT(tid, size);
+  FREE_HEAD_AT(tid, size) = loc;
+}
+
+fn void heap_free_reset(void) {
+  memset(FREE_HEADS, 0, sizeof(FREE_HEADS));
+}
diff --git a/clang/hvm4.c b/clang/hvm4.c
index 46b187ec..4fbf21c3 100644
--- a/clang/hvm4.c
+++ b/clang/hvm4.c
@@ -169,7 +169,9 @@ typedef struct {
 // Capacities
 // ==========
 
+#ifndef HEAP_CAP
 #define HEAP_CAP (1ULL << 32)
+#endif
 #define BOOK_CAP (1ULL << 24)
 #define WNF_CAP  (1ULL << 32)
 #define MAX_THREADS 64
@@ -285,6 +287,7 @@ static int    PARSE_FORK_SIDE = -1;      // -1 = off, 0 = left branch (DP0), 1 =
 // Heap
 // ====
 
+#include "heap/free.c"
 #include "heap/alloc.c"
 #include "heap/read.c"
 #include "heap/take.c"
@@ -366,6 +369,7 @@ static int    PARSE_FORK_SIDE = -1;      // -1 = off, 0 = left branch (DP0), 1 =
 #include "prim/fn/log_go_0.c"
 #include "prim/fn/log_go_1.c"
 #include "prim/fn/log_go_2.c"
+#include "prim/fn/compact.c"
 #include "prim/init.c"
 #include "print/term.c"
 
diff --git a/clang/prim/fn/compact.c b/clang/prim/fn/compact.c
new file mode 100644
index 00000000..fd38f66e
--- /dev/null
+++ b/clang/prim/fn/compact.c
@@ -0,0 +1,87 @@
+// @compact(term): Normalize term, then deep-copy the result tree to fresh heap
+// positions. The old tree and evaluation intermediates remain in place (some
+// freed to the free list during normalization, the rest as unreferenced garbage).
+//
+// This approach is safe to call from ANY position in the evaluation — inside
+// WNF, inside eval_normalize, nested in expressions, etc. — because the copy
+// step never moves or resets live heap data; it only writes to newly allocated cells.
+//
+// For the Bellman-Ford use case: each round produces ~O(tree_size) live data.
+// The evaluation intermediates are mostly freed to the free list and reused.
+// The deep copy adds ~tree_size new words. Over R rounds with tree size T,
+// total extra heap usage is O(R * T), which is modest for practical graphs.
+//
+// TODO: For very large problems (1000+ nodes, 100+ rounds), implement a proper
+// mark-compact GC with full root set discovery (including eval_normalize state
+// and WNF work queues) to reclaim dead space.
+
+// Forward declarations (defined later in include order)
+fn Term eval_normalize(Term term);
+
+// Deep-copy a fully-normalized term tree to fresh heap positions.
+// After eval_normalize, the tree should be pure SNF: constructors, numbers,
+// lambdas, SUPs, ERAs, and REFs. No unresolved DP0/DP1 or VARs.
+// Handles DP0/DP1/VAR defensively by following resolved substitutions.
+static Term compact_deep_copy(Term term) {
+  u8 tag = term_tag(term);
+
+  // Follow resolved DP0/DP1 and VAR substitutions
+  while (tag == DP0 || tag == DP1 || tag == VAR) {
+    u32 loc = term_val(term);
+    Term cell = heap_read(loc);
+    if (term_sub_get(cell)) {
+      term = term_sub_set(cell, 0);
+      tag = term_tag(term);
+    } else {
+      break; // unresolved — copy the cell as-is
+    }
+  }
+
+  // Determine number of heap children
+  u32 nch;
+  switch (tag) {
+    case NUM: case ERA: case NAM: case ANY:
+    case C00: case BJV: case BJ0: case BJ1:
+    case REF: case F_OP2_NUM:
+      return term; // no heap children — return as-is
+
+    case DP0: case DP1: case VAR:
+      nch = 1; break;
+
+    case F_EQL_R:
+      nch = 2; break;
+
+    case ALO:
+      nch = 0; break; // ALO should not appear in SNF, skip
+
+    case PRI:
+      nch = prim_arity(term_ext(term)); break;
+
+    default:
+      nch = TERM_ARITY[tag]; break;
+  }
+
+  if (nch == 0) return term;
+
+  // Recursively copy children to fresh heap locations
+  u32 old_loc = term_val(term);
+  u32 new_loc = (u32)heap_alloc(nch);
+  for (u32 i = 0; i < nch; i++) {
+    heap_set(new_loc + i, compact_deep_copy(heap_read(old_loc + i)));
+  }
+  return term_new(term_sub_get(term), tag, term_ext(term), new_loc);
+}
+
+fn Term prim_fn_compact(Term *args) {
+  // 1. Normalize the argument to SNF
+  Term root = eval_normalize(args[0]);
+
+  // 2. Deep-copy the normalized tree to fresh heap positions
+  Term copy = compact_deep_copy(root);
+
+  return copy;
+}
+
+fn void prim_compact_init(void) {
+  prim_register("compact", 7, 1, prim_fn_compact);
+}
diff --git a/clang/prim/init.c b/clang/prim/init.c
index f5d1348d..1cf6eb1d 100644
--- a/clang/prim/init.c
+++ b/clang/prim/init.c
@@ -1,3 +1,4 @@
 fn void prim_init(void) {
   prim_log_init();
+  prim_compact_init();
 }
diff --git a/clang/wnf/_.c b/clang/wnf/_.c
index 957a9e06..2e506476 100644
--- a/clang/wnf/_.c
+++ b/clang/wnf/_.c
@@ -94,6 +94,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
         u32 loc = term_val(next);
         Term cell = heap_read(loc);
         if (term_sub_get(cell)) {
+          heap_free(loc, 1);
           next = term_sub_set(cell, 0);
           goto enter;
         }
@@ -106,6 +107,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
         u32 loc = term_val(next);
         Term cell = heap_take(loc);
         if (term_sub_get(cell)) {
+          heap_free(loc, 1);
           next = term_sub_set(cell, 0);
           goto enter;
         }
@@ -125,6 +127,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
       case DUP: {
         u32  loc  = term_val(next);
         Term body = heap_read(loc + 1);
+        heap_free(loc + 1, 1);
         next = body;
         goto enter;
       }
@@ -155,6 +158,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
       case ALO: {
         u32  alo_loc = term_val(next);
         u64  pair    = heap_read(alo_loc);
+        heap_free(alo_loc, 1);
         u32  tm_loc  = (u32)(pair & 0xFFFFFFFF);
         u32  ls_loc  = (u32)(pair >> 32);
         u32  len     = term_ext(next);
@@ -231,6 +235,22 @@ __attribute__((hot)) fn Term wnf(Term term) {
       case OP2: {
         u32  loc = term_val(next);
         Term x   = heap_read(loc + 0);
+        // Fast path: both operands already NUM — skip frame push/pop
+        if (__builtin_expect(term_tag(x) == NUM, 1)) {
+          Term y = heap_read(loc + 1);
+          if (__builtin_expect(term_tag(y) == NUM, 1)) {
+            u32 opr = term_ext(next);
+            heap_free(loc, 2);
+            whnf = wnf_op2_num_num_raw(opr, term_val(x), term_val(y));
+            goto apply;
+          }
+          // x is NUM, y needs reduction — skip OP2 frame, go straight to F_OP2_NUM
+          u32 opr = term_ext(next);
+          heap_free(loc, 2);
+          stack[s_pos++] = term_new(0, F_OP2_NUM, opr, term_val(x));
+          next = y;
+          goto enter;
+        }
         stack[s_pos++] = next;
         next = x;
         goto enter;
@@ -325,6 +345,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
 
           switch (term_tag(whnf)) {
             case ERA: {
+              heap_free(app_loc, 2);
               whnf = wnf_app_era();
               continue;
             }
@@ -332,32 +353,39 @@ __attribute__((hot)) fn Term wnf(Term term) {
             case BJV:
             case BJ0:
             case BJ1: {
+              heap_free(app_loc, 2);
               whnf = wnf_app_nam(whnf, arg);
               continue;
             }
             case DRY: {
+              heap_free(app_loc, 2);
               whnf = wnf_app_dry(whnf, arg);
               continue;
             }
             case LAM: {
+              heap_free(app_loc, 2);
               next = wnf_app_lam(whnf, arg);
               goto enter;
             }
             case SUP: {
+              // NO free: app_sup reuses app_loc in-place
               whnf = wnf_app_sup(frame, whnf);
               continue;
             }
             case INC: {
+              // NO free: app_inc reuses app_loc in-place
               whnf = wnf_app_inc(frame, whnf);
               continue;
             }
             case MAT:
             case SWI: {
+              heap_free(app_loc, 2);
               stack[s_pos++] = whnf;
               next = arg;
               goto enter;
             }
             case USE: {
+              heap_free(app_loc, 2);
               stack[s_pos++] = whnf;
               next = arg;
               goto enter;
@@ -371,7 +399,8 @@ __attribute__((hot)) fn Term wnf(Term term) {
               exit(1);
             }
             default: {
-              whnf = term_new_app(whnf, arg);
+              // Rewrite APP in-place instead of allocating new
+              whnf = term_new_app_at(app_loc, whnf, arg);
               continue;
             }
           }
@@ -385,6 +414,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
           Term mat = frame;
           switch (term_tag(whnf)) {
             case ERA: {
+              heap_free(term_val(mat), 2);
               whnf = wnf_app_era();
               continue;
             }
@@ -511,6 +541,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
           u32  opr = term_ext(frame);
           u32  loc = term_val(frame);
           Term y   = heap_read(loc + 1);
+          heap_free(loc, 2);
 
           switch (term_tag(whnf)) {
             case ERA: {
@@ -587,24 +618,27 @@ __attribute__((hot)) fn Term wnf(Term term) {
 
           switch (term_tag(whnf)) {
             case ERA: {
+              heap_free(loc, 2);
               whnf = wnf_eql_era_l();
               continue;
             }
             case ANY: {
+              heap_free(loc, 2);
               whnf = wnf_eql_any_l();
               continue;
             }
             case SUP: {
+              heap_free(loc, 2);
               whnf = wnf_eql_sup_l(whnf, b);
               continue;
             }
             case INC: {
+              heap_free(loc, 2);
               whnf = wnf_eql_inc_l(whnf, b);
               continue;
             }
             default: {
-              // Store a's WHNF location, push F_EQL_R, enter b
-              // We store a in heap_read(loc+0) for later retrieval
+              // EQL reused: loc+0 stores a's WHNF for F_EQL_R phase
               heap_set(loc + 0, whnf);
               stack[s_pos++] = term_new(0, F_EQL_R, 0, loc);
               next = b;
@@ -619,6 +653,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
         case F_EQL_R: {
           u32  loc = term_val(frame);
           Term a   = heap_read(loc + 0);  // a's WHNF was stored here
+          heap_free(loc, 2);
 
           switch (term_tag(whnf)) {
             case ERA: {
@@ -698,6 +733,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
           u32  loc = term_val(frame);
           Term a   = heap_read(loc + 1);
           Term b   = heap_read(loc + 2);
+          heap_free(loc, 3);
 
           switch (term_tag(whnf)) {
             case ERA: {
@@ -730,6 +766,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
           u32  loc = term_val(frame);
           Term val = heap_read(loc + 1);
           Term bod = heap_read(loc + 2);
+          heap_free(loc, 3);
 
           switch (term_tag(whnf)) {
             case ERA: {
@@ -761,6 +798,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
         case AND: {
           u32  loc = term_val(frame);
           Term b   = heap_read(loc + 1);
+          heap_free(loc, 2);
 
           switch (term_tag(whnf)) {
             case ERA: {
@@ -792,6 +830,7 @@ __attribute__((hot)) fn Term wnf(Term term) {
         case OR: {
           u32  loc = term_val(frame);
           Term b   = heap_read(loc + 1);
+          heap_free(loc, 2);
 
           switch (term_tag(whnf)) {
             case ERA: {
diff --git a/clang/wnf/app_mat_ctr.c b/clang/wnf/app_mat_ctr.c
index 2cf2902f..70e370de 100644
--- a/clang/wnf/app_mat_ctr.c
+++ b/clang/wnf/app_mat_ctr.c
@@ -14,6 +14,7 @@ fn Term wnf_app_mat_ctr(Term mat, Term ctr) {
     u32 ari = term_tag(ctr) - C00;
     Term res = heap_read(mat_loc);
     if (ari == 0) {
+      heap_free(mat_loc, 2);
       return res;
     }
     u32 ctr_loc = term_val(ctr);
@@ -21,6 +22,8 @@ fn Term wnf_app_mat_ctr(Term mat, Term ctr) {
     for (u32 i = 0; i < ari; i++) {
       res = term_new_app_at((u32)(apps + 2 * (u64)i), res, heap_read(ctr_loc + i));
     }
+    heap_free(mat_loc, 2);
+    heap_free(ctr_loc, ari);
     return res;
   } else {
     ITRS_INC("APP-MAT-CTR-MIS");
diff --git a/clang/wnf/app_mat_num.c b/clang/wnf/app_mat_num.c
index cbc3bb43..43c224c2 100644
--- a/clang/wnf/app_mat_num.c
+++ b/clang/wnf/app_mat_num.c
@@ -11,7 +11,9 @@ fn Term wnf_app_mat_num(Term mat, Term num) {
   u32 num_val = term_val(num);
   if (mat_ext == num_val) {
     ITRS_INC("APP-MAT-NUM-MAT");
-    return heap_read(mat_loc + 0);
+    Term res = heap_read(mat_loc + 0);
+    heap_free(mat_loc, 2);
+    return res;
   } else {
     ITRS_INC("APP-MAT-NUM-MIS");
     Term g = heap_read(mat_loc + 1);
diff --git a/clang/wnf/app_mat_sup.c b/clang/wnf/app_mat_sup.c
index c960116d..35ef8351 100644
--- a/clang/wnf/app_mat_sup.c
+++ b/clang/wnf/app_mat_sup.c
@@ -11,5 +11,6 @@ fn Term wnf_app_mat_sup(Term mat, Term sup) {
   u32  loc = term_val(sup);
   Term a   = heap_read(loc + 0);
   Term b   = heap_read(loc + 1);
+  heap_free(loc, 2);
   return term_new_sup(lab, term_new_app(M.k0, a), term_new_app(M.k1, b));
 }
diff --git a/clang/wnf/dup_dry.c b/clang/wnf/dup_dry.c
index 008ce76c..e7505d08 100644
--- a/clang/wnf/dup_dry.c
+++ b/clang/wnf/dup_dry.c
@@ -11,6 +11,7 @@ fn Term wnf_dup_dry(u32 lab, u32 loc, u8 side, Term dry) {
   u32  at    = (u32)base;
   heap_set(at + 0, heap_read(d_loc + 0));
   heap_set(at + 1, heap_read(d_loc + 1));
+  heap_free(d_loc, 2);
   Copy F     = term_clone_at(at + 0, lab);
   Copy A     = term_clone_at(at + 1, lab);
   Term r0    = term_new_dry_at(at + 2, F.k0, A.k0);
diff --git a/clang/wnf/dup_lam.c b/clang/wnf/dup_lam.c
index 16894eea..e80213b1 100644
--- a/clang/wnf/dup_lam.c
+++ b/clang/wnf/dup_lam.c
@@ -11,6 +11,7 @@ fn Term wnf_dup_lam(u32 lab, u32 loc, u8 side, Term lam) {
   Term bod            = heap_read(lam_loc);
 
   if (lam_ext & LAM_ERA_MASK) {
+    heap_free(lam_loc, 1);
     u64  a      = heap_alloc(3);
     heap_set(a + 2, bod);
     Copy B      = term_clone_at(a + 2, lab);
diff --git a/clang/wnf/dup_nod.c b/clang/wnf/dup_nod.c
index 6e80e704..fb4e1109 100644
--- a/clang/wnf/dup_nod.c
+++ b/clang/wnf/dup_nod.c
@@ -15,16 +15,20 @@ fn Term wnf_dup_nod(u32 lab, u32 loc, u8 side, Term term) {
   u32  t_loc = term_val(term);
   u32  t_ext = term_ext(term);
   u8   t_tag = term_tag(term);
-  u64  block = heap_alloc(3 * (u64)ari);
-  u32  vals  = (u32)block;
-  u32  r0_loc = vals + ari;
-  u32  r1_loc = r0_loc + ari;
+  // Split into 3 separate allocations so each can reuse freed blocks
+  u64  vals_blk  = heap_alloc(ari);
+  u64  r0_blk    = heap_alloc(ari);
+  u64  r1_blk    = heap_alloc(ari);
+  u32  vals      = (u32)vals_blk;
+  u32  r0_loc    = (u32)r0_blk;
+  u32  r1_loc    = (u32)r1_blk;
   for (u32 i = 0; i < ari; i++) {
     heap_set(vals + i, heap_read(t_loc + i));
     Copy A = term_clone_at(vals + i, lab);
     heap_set(r0_loc + i, A.k0);
     heap_set(r1_loc + i, A.k1);
   }
+  heap_free(t_loc, ari);
   Term r0 = term_new(0, t_tag, t_ext, r0_loc);
   Term r1 = term_new(0, t_tag, t_ext, r1_loc);
   return heap_subst_cop(side, loc, r0, r1);
diff --git a/clang/wnf/dup_sup.c b/clang/wnf/dup_sup.c
index 8cbf6311..7b7b4562 100644
--- a/clang/wnf/dup_sup.c
+++ b/clang/wnf/dup_sup.c
@@ -15,12 +15,14 @@ fn Term wnf_dup_sup(u32 lab, u32 loc, u8 side, Term sup) {
   if (lab == sup_lab) {
     Term tm0 = heap_read(sup_loc + 0);
     Term tm1 = heap_read(sup_loc + 1);
+    heap_free(sup_loc, 2);
     return heap_subst_cop(side, loc, tm0, tm1);
   } else {
     u64 base = heap_alloc(6);
     u32 at   = (u32)base;
     heap_set(at + 0, heap_read(sup_loc + 0));
     heap_set(at + 1, heap_read(sup_loc + 1));
+    heap_free(sup_loc, 2);
     Copy A  = term_clone_at(at + 0, lab);
     Copy B  = term_clone_at(at + 1, lab);
     Term s0 = term_new_sup_at(at + 2, sup_lab, A.k0, B.k0);
diff --git a/clang/wnf/op2_sup.c b/clang/wnf/op2_sup.c
index 5bbc398f..b646e09d 100644
--- a/clang/wnf/op2_sup.c
+++ b/clang/wnf/op2_sup.c
@@ -7,7 +7,10 @@ fn Term wnf_op2_sup(u32 opr, Term sup, Term y) {
   u32  lab     = term_ext(sup);
   u32  sup_loc = term_val(sup);
   Copy Y       = term_clone(lab, y);
-  Term op0     = term_new_op2(opr, heap_read(sup_loc + 0), Y.k0);
-  Term op1     = term_new_op2(opr, heap_read(sup_loc + 1), Y.k1);
+  Term a       = heap_read(sup_loc + 0);
+  Term b       = heap_read(sup_loc + 1);
+  heap_free(sup_loc, 2);
+  Term op0     = term_new_op2(opr, a, Y.k0);
+  Term op1     = term_new_op2(opr, b, Y.k1);
   return term_new_sup(lab, op0, op1);
 }

From 836fd1cc78fdfd062264470fea42fa9bb5bbb26c Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 13:29:17 +0200
Subject: [PATCH 2/6] Add post-parse hints, resizable work queues, and elastic
 ring buffer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Analyze BOOK terms after parsing to compute program hints (node count,
SUP/DUP presence, max depth, arity stats). Use hints to right-size all
runtime buffers instead of hardcoded compile-time capacities — reducing
memory footprint from GBs to KBs for small programs while still
handling large ones via dynamic growth.

Key changes:
- analyze/hints.c: O(N) post-parse analysis producing HvmHints
- data/wsq.c: resizable Chase-Lev deque (WsqArray indirection, 2x growth)
- data/wspq.c: dynamic bracket count and per-bucket capacity
- data/uset.c: sized initialization from hints instead of HEAP_CAP
- data/elastic_ring.c: ouroboros double-map ring buffer with elastic
  growth/shrink via memfd_create (Linux) / shm_open (macOS)
- eval/normalize.c, eval/collapse.c, cnf/_.c: hint-based buffer sizing
- main.c: -v flag for hints output, --test-ring for ring self-test

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/analyze/hints.c     | 123 +++++++++++
 clang/cnf/_.c             |   8 +-
 clang/data/elastic_ring.c | 431 ++++++++++++++++++++++++++++++++++++++
 clang/data/uset.c         |  17 +-
 clang/data/wspq.c         |  31 ++-
 clang/data/wsq.c          | 121 ++++++++---
 clang/eval/collapse.c     |  24 ++-
 clang/eval/normalize.c    |  19 +-
 clang/hvm4.c              |   8 +
 clang/main.c              |  23 +-
 10 files changed, 752 insertions(+), 53 deletions(-)
 create mode 100644 clang/analyze/hints.c
 create mode 100644 clang/data/elastic_ring.c

diff --git a/clang/analyze/hints.c b/clang/analyze/hints.c
new file mode 100644
index 00000000..f22cab08
--- /dev/null
+++ b/clang/analyze/hints.c
@@ -0,0 +1,123 @@
+// analyze/hints.c - Post-parse program analysis for buffer sizing.
+//
+// Context
+// - Runs after parse_def() completes, before evaluation begins.
+// - Scans the static BOOK/HEAP to compute program size hints.
+// - Hints are used to right-size runtime buffers (queues, uset, wspq).
+//
+// Design
+// - Linear scan of static heap for tag statistics: O(N) where N = static terms.
+// - Tree walk per BOOK entry for max depth measurement.
+// - All data is read-only after parsing; no synchronization needed.
+
+typedef struct {
+  u64 node_count;      // Total term words in static heap
+  u32 def_count;       // Number of @-definitions (TABLE_LEN)
+  u32 max_arity;       // Largest constructor arity seen (C00-C16)
+  u32 dup_count;       // DUP nodes found
+  u32 sup_count;       // SUP nodes found
+  u32 max_depth;       // Deepest term tree across all definitions
+  u64 static_heap;     // Heap words used by static definitions
+  u8  has_sup;         // 1 if any SUP exists
+  u8  has_pri;         // 1 if any PRI exists
+} HvmHints;
+
+// Return the smallest exponent p >= min_pow2 with 2^p >= val, capped at max_pow2.
+fn u32 hints_cap_pow2(u64 val, u32 min_pow2, u32 max_pow2) {
+  u32 p = min_pow2;
+  while ((1ULL << p) < val && p < max_pow2) p++;
+  return p;
+}
+
+fn HvmHints hvm_analyze(void) {
+  HvmHints h = {0};
+  h.def_count    = TABLE_LEN;
+  h.static_heap  = HEAP_NEXT_AT(0);
+  h.node_count   = h.static_heap > 1 ? h.static_heap - 1 : 0;
+
+  // Linear scan of static heap for tag statistics.
+  for (u64 i = 1; i < h.static_heap; i++) {
+    Term t  = HEAP[i];
+    u8  tag = term_tag(t);
+    if (tag == DUP) h.dup_count++;
+    if (tag == SUP) { h.sup_count++; h.has_sup = 1; }
+    if (tag == PRI) h.has_pri = 1;
+    if (tag >= C00 && tag <= C16) {
+      u32 ari = tag - C00;
+      if (ari > h.max_arity) h.max_arity = ari;
+    }
+  }
+
+  // Tree walk per definition for max depth (a lower bound if the stack fills).
+  // Iterative DFS (no recursion) on a fixed stack; overflowing children are dropped.
+  #define HINTS_WALK_STACK 4096
+  u32 walk_loc[HINTS_WALK_STACK];
+  u32 walk_dep[HINTS_WALK_STACK];
+
+  for (u32 id = 0; id < TABLE_LEN; id++) {
+    if (BOOK[id] == 0) continue;
+    u32 sp = 0;
+    walk_loc[sp] = BOOK[id];
+    walk_dep[sp] = 0;
+    sp++;
+
+    while (sp > 0) {
+      sp--;
+      u32 loc   = walk_loc[sp];
+      u32 depth = walk_dep[sp];
+      if (depth > h.max_depth) h.max_depth = depth;
+
+      if (loc == 0 || loc >= h.static_heap) continue;
+
+      Term t   = HEAP[loc];
+      u8   tag = term_tag(t);
+      u32  val = term_val(t);
+
+      // Only recurse into children of compound nodes.
+      // DP0/DP1/VAR/ALO/REF/NUM/ERA etc. have arity 0 → no children.
+      u32 ari = TERM_ARITY[tag];
+      if (tag == PRI) ari = 0; // can't determine statically
+
+      for (u32 i = 0; i < ari && sp < HINTS_WALK_STACK; i++) {
+        walk_loc[sp] = val + i;
+        walk_dep[sp] = depth + 1;
+        sp++;
+      }
+    }
+  }
+  #undef HINTS_WALK_STACK
+
+  return h;
+}
+
+// Print hints summary to stderr (used by -v flag).
+fn void hvm_hints_print(HvmHints *h) {
+  fprintf(stderr, "[hints] defs=%u nodes=%llu max_arity=%u dups=%u sups=%u depth=%u static_heap=%llu\n",
+    h->def_count,
+    (unsigned long long)h->node_count,
+    h->max_arity,
+    h->dup_count,
+    h->sup_count,
+    h->max_depth,
+    (unsigned long long)h->static_heap);
+
+  // Compute and display buffer sizing decisions.
+  u32 norm_pow2 = hints_cap_pow2(h->node_count / 4, 8, 24);
+  u64 uset_locs = h->static_heap * 64;
+  if (uset_locs < 4096) uset_locs = 4096;
+  if (uset_locs > HEAP_CAP) uset_locs = HEAP_CAP;
+  u64 uset_kb = ((uset_locs + 63) >> 6) * 8 / 1024;
+
+  u32 wspq_brackets = h->max_depth + 4;
+  if (wspq_brackets > WSPQ_BRACKETS) wspq_brackets = WSPQ_BRACKETS;
+  if (wspq_brackets < 4) wspq_brackets = 4;
+
+  fprintf(stderr, "[hints] normalize_queue=2^%u uset=%lluKB", norm_pow2, (unsigned long long)uset_kb);
+  if (!h->has_sup) {
+    fprintf(stderr, " collapse=minimal(no SUPs)");
+  } else {
+    u32 col_pow2 = hints_cap_pow2(h->sup_count * 4, 8, 24);
+    fprintf(stderr, " collapse_queue=2^%u brackets=%u", col_pow2, wspq_brackets);
+  }
+  fprintf(stderr, "\n");
+}
diff --git a/clang/cnf/_.c b/clang/cnf/_.c
index b3001f53..8d41e126 100644
--- a/clang/cnf/_.c
+++ b/clang/cnf/_.c
@@ -40,11 +40,11 @@ fn void cnf_pool_clear(void) {
   atomic_store_explicit(&CNF_POOL, NULL, memory_order_release);
 }
 
-fn u8 cnf_pool_init(CnfPool *pool, u32 n) {
+fn u8 cnf_pool_init_sized(CnfPool *pool, u32 n, u32 cap_pow2) {
   pool->n = n;
   atomic_store_explicit(&pool->pending.v, n > 1 ? n : 0, memory_order_relaxed);
   for (u32 i = 0; i < n; ++i) {
-    if (!wsq_init(&pool->dq[i], CNF_POOL_WS_CAP_POW2)) {
+    if (!wsq_init(&pool->dq[i], cap_pow2)) {
       for (u32 j = 0; j < i; ++j) {
         wsq_free(&pool->dq[j]);
       }
@@ -56,6 +56,10 @@ fn u8 cnf_pool_init(CnfPool *pool, u32 n) {
   return 1;
 }
 
+fn u8 cnf_pool_init(CnfPool *pool, u32 n) {
+  return cnf_pool_init_sized(pool, n, CNF_POOL_WS_CAP_POW2);
+}
+
 fn void cnf_pool_free(CnfPool *pool) {
   for (u32 i = 0; i < pool->n; ++i) {
     wsq_free(&pool->dq[i]);
diff --git a/clang/data/elastic_ring.c b/clang/data/elastic_ring.c
new file mode 100644
index 00000000..90f05674
--- /dev/null
+++ b/clang/data/elastic_ring.c
@@ -0,0 +1,431 @@
+// data/elastic_ring.c - Elastic Cyclic Ouroboros Buffer.
+//
+// Context
+// - General-purpose ring buffer with zero-copy wrap-around via double-mapping.
+// - Same physical memory is mapped twice contiguously in virtual address space,
+//   so reads/writes that cross the buffer boundary are seamless (no split logic).
+// - Elastic: grows via ftruncate + remap, shrinks via ftruncate + remap.
+//   Live data is copied to a temporary heap buffer, then restored at offset 0 after the remap.
+//
+// Design
+// - Linux: memfd_create for anonymous backing fd.
+// - macOS/POSIX: shm_open + immediate shm_unlink for anonymous backing fd.
+// - Double-map: reserve 2*cap virtual space, MAP_FIXED both halves to same fd.
+// - Modular head/tail indices in [0, cap). Separate count for full/empty.
+// - Growth: save live data (contiguous via double-map), extend fd, remap, restore.
+// - Shrink: same approach in reverse, halving capacity.
+//
+// Notes
+// - Single-threaded (owner only). Concurrency is handled at a higher layer.
+// - Capacity is always a power-of-two multiple of page size.
+// - ring_push_ptr / ring_pop_ptr return pointers valid for contiguous access
+//   up to (cap - count) and count bytes respectively, even across the boundary.
+
+#include <sys/mman.h>
+#include <unistd.h>
+#include <string.h>
+#include <stdbool.h>
+
+#if defined(__APPLE__)
+#include <fcntl.h>
+#endif
+
+// ============================================================
+// Platform backend
+// ============================================================
+
+#if defined(__linux__)
+#define ERING_HAS_MEMFD 1
+#include <sys/syscall.h>
+#ifndef MFD_CLOEXEC
+#define MFD_CLOEXEC 0x0001U
+#endif
+#elif defined(__APPLE__) || _POSIX_SHARED_MEMORY_OBJECTS > 0
+#define ERING_HAS_SHM 1
+#else
+#error "elastic_ring requires memfd_create (Linux) or shm_open (macOS/POSIX)"
+#endif
+
+// Create an unnamed fd to back the ring; returns the fd, or a negative value on failure.
+static int ering_create_fd(void) {
+#if defined(ERING_HAS_MEMFD)
+  return (int)syscall(SYS_memfd_create, "ering", MFD_CLOEXEC);  // raw syscall instead of a libc wrapper
+#elif defined(ERING_HAS_SHM)
+  static _Atomic u32 ering_shm_id = 0;  // process-wide counter so each call gets a distinct name
+  char name[64];
+  u32 id = atomic_fetch_add_explicit(&ering_shm_id, 1, memory_order_relaxed);
+  snprintf(name, sizeof(name), "/ering_%d_%u", getpid(), id);  // name = pid + counter
+  int fd = shm_open(name, O_RDWR | O_CREAT | O_EXCL, 0600);  // NOTE(review): macOS may allow ftruncate on a shm object only once — confirm resize works there
+  if (fd >= 0) shm_unlink(name);  // remove the name right away; only the open fd refers to the object
+  return fd;
+#endif
+}
+
+// Create the mirrored view: fd[0, cap) mapped twice, back to back.
+// Returns the start of the 2*cap virtual window, or NULL if any mmap fails.
+static void *ering_double_map(int fd, size_t cap) {
+  const size_t window = 2 * cap;
+  const int prot  = PROT_READ | PROT_WRITE;
+  const int flags = MAP_SHARED | MAP_FIXED;
+  // Step 1: reserve an inaccessible window so both halves end up adjacent.
+  char *window_base = (char *)mmap(NULL, window, PROT_NONE,
+                                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+  if ((void *)window_base == MAP_FAILED) {
+    return NULL;
+  }
+
+  // Step 2: overlay each half of the window with the same file range,
+  //         [base + k*cap, base + (k+1)*cap) -> fd[0, cap) for k = 0, 1.
+  for (size_t half = 0; half < 2; ++half) {
+    void *mapped = mmap(window_base + half * cap, cap, prot, flags, fd, 0);
+    if (mapped == MAP_FAILED) {
+      // Unmapping the whole window also removes a half mapped earlier.
+      munmap(window_base, window);
+      return NULL;
+    }
+  }
+
+  return window_base;
+}
+
+// ============================================================
+// ElasticRing type
+// ============================================================
+
+typedef struct {
+  u8    *data;   // base of the double-mapped window (2 * cap bytes of address space)
+  size_t cap;    // bytes of backing storage; set by ring_init / ring_grow / ring_shrink
+  size_t mask;   // cap - 1; the commit functions use it to wrap head and tail
+  size_t head;   // byte offset where the next push writes
+  size_t tail;   // byte offset where the next pop reads
+  size_t count;  // live bytes; tells full from empty when head == tail
+  int    fd;     // backing fd; set to -1 once closed
+  size_t pg;     // page size recorded by ring_init; ring_shrink never goes below it
+} ElasticRing;
+
+// ============================================================
+// Helpers
+// ============================================================
+
+static size_t ering_page_size(void) {
+  const long reported = sysconf(_SC_PAGESIZE);  // non-positive means the query failed
+  return (reported <= 0) ? (size_t)4096 : (size_t)reported;  // fall back to 4096 bytes
+}
+
+// Smallest pg * 2^k that is >= requested; doubling stops before it would overflow.
+static size_t ering_round_cap(size_t requested, size_t pg) {
+  size_t cap = pg;
+  while (cap < requested && cap <= (size_t)-1 / 2) cap *= 2;  // never wrap to 0 (was an endless loop)
+  return cap;
+}
+
+fn size_t ring_used(ElasticRing *r)  { return r->count; }  // bytes currently stored
+fn size_t ring_avail(ElasticRing *r) { return r->cap - r->count; }  // bytes that fit without growing
+fn size_t ring_capacity(ElasticRing *r) { return r->cap; }  // current capacity in bytes
+
+// ============================================================
+// Init / Free
+// ============================================================
+
+fn bool ring_init(ElasticRing *r, size_t initial_cap) {
+  memset(r, 0, sizeof(*r));
+  r->pg = ering_page_size();
+  // Never go below one page; round the request up to a valid capacity.
+  size_t want = initial_cap;
+  if (want < r->pg) want = r->pg;
+  const size_t bytes = ering_round_cap(want, r->pg);
+
+  // Acquire the backing fd, size it, then build the mirrored mapping.
+  r->fd = ering_create_fd();
+  if (r->fd < 0) return false;
+
+  u8 *view = NULL;
+  if (ftruncate(r->fd, (off_t)bytes) == 0) {
+    view = (u8 *)ering_double_map(r->fd, bytes);
+  }
+  if (view == NULL) {
+    // Sizing or mapping failed: release the fd and mark it invalid.
+    close(r->fd);
+    r->fd = -1;
+    return false;
+  }
+
+  r->data = view;  // head, tail and count are already 0 from the memset
+  r->cap  = bytes;
+  r->mask = bytes - 1;
+  return true;
+}
+
+fn void ring_free(ElasticRing *r) {
+  u8 *view = r->data;
+  if (view != NULL) {
+    // One munmap covers both mirrored halves (2 * cap bytes).
+    munmap(view, 2 * r->cap);
+    r->data = NULL;
+  }
+  if (r->fd >= 0) { close(r->fd); r->fd = -1; }
+  // Zero the bookkeeping fields; pg is left untouched.
+  r->count = r->tail = r->head = r->mask = r->cap = 0;
+}
+
+// ============================================================
+// Grow / Shrink
+// ============================================================
+
+// Double the buffer capacity. Live data is preserved (re-based to offset 0).
+// Returns true on success; false leaves capacity and contents as they were.
+fn bool ring_grow(ElasticRing *r) {
+  size_t old_cap = r->cap;
+  size_t new_cap = old_cap * 2;
+  size_t used    = r->count;
+
+  // Save live data via double-map (always contiguous from data+tail).
+  u8 *save = NULL;
+  if (used > 0) {
+    save = (u8 *)malloc(used);
+    if (!save) return false;
+    memcpy(save, r->data + r->tail, used);
+  }
+
+  // Extend backing fd. Nothing has been unmapped yet, so failure is harmless.
+  if (ftruncate(r->fd, (off_t)new_cap) != 0) {
+    free(save);
+    return false;
+  }
+
+  // Tear down old double-mapping and create a larger one.
+  munmap(r->data, 2 * old_cap);
+  r->data = (u8 *)ering_double_map(r->fd, new_cap);
+  if (!r->data) {
+    // Attempt rollback to the old size; the fd staying larger is harmless.
+    ftruncate(r->fd, (off_t)old_cap);
+    r->data = (u8 *)ering_double_map(r->fd, old_cap);
+    if (!r->data) {
+      fprintf(stderr, "elastic_ring: fatal remap failure during growth\n");
+      exit(1);
+    }
+    if (save) { memcpy(r->data, save, used); free(save); }
+    r->tail = 0;
+    r->head = used & r->mask;  // used may equal cap here; keep head in [0, cap)
+    return false;
+  }
+
+  // Restore live data at position 0.
+  if (save) {
+    memcpy(r->data, save, used);
+    free(save);
+  }
+
+  r->cap  = new_cap;
+  r->mask = new_cap - 1;
+  r->tail = 0;
+  r->head = used;  // used <= old_cap < new_cap, so no wrap is needed
+  return true;
+}
+
+// Halve the buffer if significantly underutilized (count <= cap/4).
+// Best-effort: on any failure the old capacity and live data are kept.
+fn void ring_shrink(ElasticRing *r) {
+  size_t old_cap = r->cap;
+  size_t new_cap = old_cap / 2;
+  size_t used    = r->count;
+  // cap/4 (not cap/2) leaves slack so the next push does not regrow at once.
+  if (used > old_cap / 4 || new_cap < r->pg) return;
+
+  // Save live data via double-map (always contiguous from data+tail).
+  u8 *save = NULL;
+  if (used > 0) {
+    save = (u8 *)malloc(used);
+    if (!save) return;
+    memcpy(save, r->data + r->tail, used);
+  }
+
+  // Drop the old window, shrink the fd, and try to map the smaller window.
+  munmap(r->data, 2 * old_cap);
+  r->data = NULL;
+  size_t cap = new_cap;
+  bool shrunk = (ftruncate(r->fd, (off_t)new_cap) == 0);
+  if (shrunk) {
+    r->data = (u8 *)ering_double_map(r->fd, new_cap);
+  }
+
+  if (!r->data) {
+    // Roll back: re-extend the fd if it was shrunk, then remap the old size.
+    cap = old_cap;
+    if (!shrunk || ftruncate(r->fd, (off_t)old_cap) == 0) {
+      r->data = (u8 *)ering_double_map(r->fd, old_cap);
+    }
+    if (!r->data) {
+      // Nothing is mapped any more, so the ring cannot be used further.
+      fprintf(stderr, "elastic_ring: fatal remap failure during shrink\n");
+      exit(1);
+    }
+  }
+
+  // Restore live data at offset 0 of whichever window is now mapped.
+  if (save) {
+    memcpy(r->data, save, used);
+    free(save);
+  }
+
+  r->cap  = cap;
+  r->mask = cap - 1;
+  r->tail = 0;
+  r->head = used;
+}
+
+// ============================================================
+// Push / Pop (pointer-based, zero-copy)
+// ============================================================
+
+// Get a writable pointer for `len` bytes. Auto-grows if needed.
+// The returned pointer is valid for contiguous write of `len` bytes
+// even if it crosses the physical buffer boundary (Ouroboros property).
+// Returns NULL if growth fails; the space check cannot overflow for huge `len`.
+// Caller MUST call ring_push_commit(r, len) after writing.
+fn void *ring_push_ptr(ElasticRing *r, size_t len) {
+  while (len > r->cap - r->count) {  // count <= cap, so the subtraction cannot wrap
+    if (!ring_grow(r)) return NULL;
+  }
+  return r->data + r->head;
+}
+
+// Publish `len` bytes written via ring_push_ptr: advance head (wrapped by mask) and count.
+fn void ring_push_commit(ElasticRing *r, size_t len) {
+  r->head = (r->head + len) & r->mask;
+  r->count += len;  // not validated here: len is trusted to fit the free space
+}
+
+// Borrow a read pointer to the oldest `len` bytes without consuming them.
+// Thanks to the mirrored mapping the bytes are contiguous from the returned
+// address even when they wrap past the physical end of the buffer.
+// Returns NULL when the ring holds fewer than `len` bytes.
+// Pair with ring_pop_commit(r, len) once the bytes have been read.
+fn void *ring_pop_ptr(ElasticRing *r, size_t len) {
+  const bool enough = (len <= r->count);
+  return enough ? (void *)(r->data + r->tail) : NULL;
+}
+
+// Consume `len` bytes after a ring_pop_ptr read: advance tail (wrapped by mask), reduce count.
+fn void ring_pop_commit(ElasticRing *r, size_t len) {
+  r->tail = (r->tail + len) & r->mask;
+  r->count -= len;  // not validated here: len > count would wrap the unsigned count
+}
+
+// ============================================================
+// Convenience: u64 element push/pop
+// ============================================================
+
+fn bool ring_push_u64(ElasticRing *r, u64 val) {
+  void *dst = ring_push_ptr(r, sizeof val);
+  if (dst == NULL) return false;    // the ring could not grow
+  memcpy(dst, &val, sizeof val);    // store the 8 bytes at head
+  ring_push_commit(r, sizeof val);  // publish them
+  return true;
+}
+
+fn bool ring_pop_u64(ElasticRing *r, u64 *out) {
+  const void *src = ring_pop_ptr(r, sizeof *out);
+  if (src == NULL) return false;    // fewer than 8 bytes stored
+  memcpy(out, src, sizeof *out);    // read the oldest 8 bytes
+  ring_pop_commit(r, sizeof *out);  // consume them
+  return true;
+}
+
+// ============================================================
+// Self-test
+// ============================================================
+
+fn int ring_test(void) {
+#ifdef NDEBUG  // every check below is an assert(); compiled out, nothing would be tested
+  fprintf(stderr, "ring_test: assertions disabled (NDEBUG)\n"); return 1;
+#endif
+  ElasticRing r;
+  size_t pg = ering_page_size();
+
+  // 1. Init
+  if (!ring_init(&r, pg)) {
+    fprintf(stderr, "ring_test: init failed\n");
+    return 1;
+  }
+  assert(r.cap == pg);
+  assert(r.count == 0);
+
+  // 2. Basic push/pop
+  u64 v;
+  assert(ring_push_u64(&r, 42));
+  assert(ring_pop_u64(&r, &v) && v == 42);
+  assert(ring_used(&r) == 0);
+
+  // 3. Fill to capacity, drain
+  size_t n = r.cap / sizeof(u64);
+  for (size_t i = 0; i < n; i++) assert(ring_push_u64(&r, i + 100));
+  assert(ring_avail(&r) == 0);
+  for (size_t i = 0; i < n; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 100);
+  }
+  assert(ring_used(&r) == 0);
+
+  // 4. Ouroboros wrap-around: move head/tail 3/4 of the way round, then straddle the end.
+  for (size_t i = 0; i < n * 3 / 4; i++) assert(ring_push_u64(&r, 0xDEAD));
+  for (size_t i = 0; i < n * 3 / 4; i++) assert(ring_pop_u64(&r, &v));
+  // tail & head are now about 3/4 into the buffer. Push n/2 — head wraps past cap.
+  for (size_t i = 0; i < n / 2; i++) assert(ring_push_u64(&r, i + 5000));
+  for (size_t i = 0; i < n / 2; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 5000);
+  }
+
+  // 5. Contiguous bulk read across wrap boundary (advance n/2 so tail sits ~3/4 in).
+  for (size_t i = 0; i < n / 2; i++) assert(ring_push_u64(&r, 0xBEEF));
+  for (size_t i = 0; i < n / 2; i++) assert(ring_pop_u64(&r, &v));
+  size_t wn = n / 2;
+  for (size_t i = 0; i < wn; i++) assert(ring_push_u64(&r, i + 9000));
+  assert(r.tail + wn * sizeof(u64) > r.cap);  // the span really crosses the physical end
+  u64 *arr = (u64 *)ring_pop_ptr(&r, wn * sizeof(u64));
+  assert(arr != NULL);
+  for (size_t i = 0; i < wn; i++) assert(arr[i] == i + 9000);
+  ring_pop_commit(&r, wn * sizeof(u64));
+
+  // 6. Growth preserves data.
+  size_t old_cap = r.cap;
+  for (size_t i = 0; i < n + 1; i++) assert(ring_push_u64(&r, i + 7000));
+  assert(r.cap > old_cap);
+  for (size_t i = 0; i < n + 1; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 7000);
+  }
+
+  // 7. Growth with wrapped data: fill 3/4, pop 1/4, push full cap → forces
+  //    growth while live data wraps around the old buffer boundary.
+  n = r.cap / sizeof(u64);  // recalc after growth
+  for (size_t i = 0; i < n * 3 / 4; i++) assert(ring_push_u64(&r, i + 100000));
+  for (size_t i = 0; i < n / 4; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 100000);
+  }
+  // Now used = n/2 and the live span wraps; pushing n more overflows → growth.
+  size_t to_push = n;  // more than avail → forces growth
+  for (size_t i = 0; i < to_push; i++) assert(ring_push_u64(&r, i + 200000));
+  // Verify the earlier data.
+  for (size_t i = n / 4; i < n * 3 / 4; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 100000);
+  }
+  // Verify the new data.
+  for (size_t i = 0; i < to_push; i++) {
+    assert(ring_pop_u64(&r, &v));
+    assert(v == i + 200000);
+  }
+  assert(ring_used(&r) == 0);
+
+  // 8. Shrink.
+  assert(ring_push_u64(&r, 12345));
+  ring_shrink(&r);
+  assert(ring_pop_u64(&r, &v) && v == 12345);
+
+  ring_free(&r);
+  fprintf(stderr, "[elastic_ring] all tests passed\n");
+  return 0;
+}
diff --git a/clang/data/uset.c b/clang/data/uset.c
index ff625907..4eee5786 100644
--- a/clang/data/uset.c
+++ b/clang/data/uset.c
@@ -24,9 +24,10 @@ fn u64 uset_words_for_heap(void) {
   return (HEAP_CAP + 63ull) >> 6;
 }
 
-// Initialize the set bitmap.
-fn void uset_init(Uset *set) {
-  u64 words = uset_words_for_heap();
+// Initialize the set bitmap covering max_locs heap locations.
+fn void uset_init_sized(Uset *set, u64 max_locs) {
+  u64 words = (max_locs + 63ull) >> 6;
+  if (words == 0) words = 1;
   set->words = (_Atomic u64 *)calloc((size_t)words, sizeof(u64));
   if (!set->words) {
     fprintf(stderr, "uset: allocation failed\n");
@@ -35,6 +36,11 @@ fn void uset_init(Uset *set) {
   set->word_count = words;
 }
 
+// Initialize the set bitmap for HEAP_CAP locations; thin wrapper over uset_init_sized.
+fn void uset_init(Uset *set) {
+  uset_init_sized(set, HEAP_CAP);
+}
+
 // Release the bitmap and reset the set state.
 fn void uset_free(Uset *set) {
   if (set->words) {
@@ -58,13 +64,14 @@ fn u8 uset_has(Uset *set, u32 key) {
 }
 
 // Insert key if missing; returns 1 if inserted, 0 if already present.
+// For out-of-range keys: returns 1 (treat as new) to avoid skipping work.
 fn u8 uset_add(Uset *set, u32 key) {
   if (key == 0) {
     return 0;
   }
   u64 word_idx = ((u64)key) >> 6;
-  if (word_idx >= set->word_count) {
-    return 0;
+  if (__builtin_expect(word_idx >= set->word_count, 0)) {
+    return 1;
   }
   u64 bit_mask = 1ull << (key & 63u);
   u64 prev = atomic_fetch_or_explicit(&set->words[word_idx], bit_mask, memory_order_relaxed);
diff --git a/clang/data/wspq.c b/clang/data/wspq.c
index c68b2654..bcb7b775 100644
--- a/clang/data/wspq.c
+++ b/clang/data/wspq.c
@@ -55,6 +55,7 @@ typedef struct __attribute__((aligned(256))) {
 typedef struct {
   WspqBank bank[MAX_THREADS];
   u32 n;
+  u32 brackets;  // actual number of brackets in use (≤ WSPQ_BRACKETS)
 } Wspq;
 
 // Return index of least-significant set bit (undefined for m == 0).
@@ -81,16 +82,19 @@ static inline u8 wspq_key_bucket(u32 key) {
   return (u8)bucket;
 }
 
-// Initialize all per-worker bucket queues.
-static inline bool wspq_init(Wspq *ws, u32 nthreads) {
+// Initialize all per-worker bucket queues with specified bracket count and capacity.
+static inline bool wspq_init_sized(Wspq *ws, u32 nthreads, u32 brackets, u32 cap_pow2) {
   ws->n = nthreads;
+  if (brackets > WSPQ_BRACKETS) brackets = WSPQ_BRACKETS;
+  if (brackets < 1) brackets = 1;
+  ws->brackets = brackets;
 
   for (u32 t = 0; t < nthreads; ++t) {
     atomic_store_explicit(&ws->bank[t].nonempty.v, 0ull, memory_order_relaxed);
-    for (u32 b = 0; b < WSPQ_BRACKETS; ++b) {
-      if (!wsq_init(&ws->bank[t].q[b], WSPQ_CAP_POW2)) {
+    for (u32 b = 0; b < brackets; ++b) {
+      if (!wsq_init(&ws->bank[t].q[b], cap_pow2)) {
         for (u32 t2 = 0; t2 <= t; ++t2) {
-          u32 bmax = WSPQ_BRACKETS;
+          u32 bmax = brackets;
           if (t2 == t) {
             bmax = b;
           }
@@ -105,10 +109,15 @@ static inline bool wspq_init(Wspq *ws, u32 nthreads) {
   return true;
 }
 
+// Initialize with WSPQ_BRACKETS brackets and WSPQ_CAP_POW2 capacity; forwards wspq_init_sized's result.
+static inline bool wspq_init(Wspq *ws, u32 nthreads) {
+  return wspq_init_sized(ws, nthreads, WSPQ_BRACKETS, WSPQ_CAP_POW2);
+}
+
 // Free all per-worker bucket queues.
 static inline void wspq_free(Wspq *ws) {
   for (u32 t = 0; t < ws->n; ++t) {
-    for (u32 b = 0; b < WSPQ_BRACKETS; ++b) {
+    for (u32 b = 0; b < ws->brackets; ++b) {
       wsq_free(&ws->bank[t].q[b]);
     }
   }
@@ -120,7 +129,7 @@ static inline bool wspq_bucket_full_all(Wspq *ws, u8 b) {
     WsDeque *q = &ws->bank[t].q[b];
     size_t bot = atomic_load_explicit(&q->bot.v, memory_order_relaxed);
     size_t top = atomic_load_explicit(&q->top.v, memory_order_relaxed);
-    if (bot - top < q->cap) {
+    if (bot - top < wsq_capacity(q)) {
       return false;
     }
   }
@@ -132,7 +141,9 @@ static inline void wspq_push(Wspq *ws, u32 tid, u8 key, u64 task) {
     return;
   }
   u8 bucket = wspq_key_bucket(key);
+  if (bucket >= ws->brackets) bucket = (u8)(ws->brackets - 1);
   WsDeque *q = &ws->bank[tid].q[bucket];
+  // wsq_push now grows on full, so this loop body rarely executes.
   u32 spins = 1;
   while (!wsq_push(q, task)) {
     if ((spins % WSPQ_DEADLOCK_CHECK_PERIOD) == 0) {
@@ -179,10 +190,10 @@ static inline u32 wspq_steal_some(
     return 0u;
   }
 
-  u32 b_limit = WSPQ_BRACKETS;
+  u32 b_limit = ws->brackets;
   if (restrict_deeper) {
     u64 my_mask = atomic_load_explicit(&ws->bank[me].nonempty.v, memory_order_relaxed);
-    u32 my_min = WSPQ_BRACKETS;
+    u32 my_min = ws->brackets;
     if (my_mask != 0ull) {
       my_min = wspq_lsb64(my_mask);
     }
@@ -190,7 +201,7 @@ static inline u32 wspq_steal_some(
   }
 
   u64 allowed_mask = ~0ull;
-  if (b_limit < WSPQ_BRACKETS) {
+  if (b_limit < ws->brackets) {
     allowed_mask = (1ull << b_limit) - 1ull;
   }
 
diff --git a/clang/data/wsq.c b/clang/data/wsq.c
index a4bd2b41..a3612947 100644
--- a/clang/data/wsq.c
+++ b/clang/data/wsq.c
@@ -1,18 +1,21 @@
-// data/wsq.c - Chase-Lev work-stealing deque for u64 tasks.
+// data/wsq.c - Resizable Chase-Lev work-stealing deque for u64 tasks.
 //
 // Context
 // - Used by parallel evaluators to distribute heap locations across workers.
 // - Single-owner pushes and pops from the bottom; other threads steal from the top.
 //
 // Design
-// - Ring buffer of fixed capacity (power of two) storing u64 tasks.
+// - Circular array (WsqArray) of power-of-two capacity storing u64 tasks.
+// - Owner-initiated 2x growth when full: allocate new array, copy live elements,
+//   atomically publish new array pointer (release). Old arrays are deferred-freed.
+// - Thieves load array pointer with acquire after loading top — they see either
+//   old or new array. Old array data is valid (never written after growth).
+//   CAS on top prevents double-consumption regardless of which array was read.
 // - Atomic top/bottom indices are cache-line padded to limit false sharing.
-// - Owner operations are wait-free except for full/empty checks.
-// - Steals are lock-free and may fail under contention.
 //
 // Notes
-// - Not multi-producer: only the owner thread may push/pop.
-// - Capacity is fixed after init; wsq_push returns 0 when full.
+// - Not multi-producer: only the owner thread may push/pop/grow.
+// - wsq_push grows the deque if full; it always returns 1 (OOM during growth exits).
 // - Counters are monotonic; wrap-around is not guarded (practically unreachable).
 
 #include <stdatomic.h>
@@ -20,13 +23,22 @@
 #include <stddef.h>
 #include <stdlib.h>
 
+// Backing array for the deque; wsq_grow builds a larger one and publishes it atomically.
+typedef struct {
+  u64    *buf;   // task slots, indexed as buf[i & mask]
+  size_t  mask;  // cap - 1
+} WsqArray;
+
+// Maximum number of old arrays kept alive until wsq_free.
+#define WSQ_PREV_MAX 16
+
 // Work-stealing deque state (single owner, multi-stealer).
 typedef struct __attribute__((aligned(CACHE_L1))) {
   _Alignas(CACHE_L1) CachePaddedAtomic top;
   _Alignas(CACHE_L1) CachePaddedAtomic bot;
-  _Alignas(CACHE_L1) u64 *buf;
-  size_t mask;
-  size_t cap;
+  _Alignas(CACHE_L1) _Atomic(WsqArray *) arr;
+  WsqArray *prev[WSQ_PREV_MAX];
+  u32 prev_count;
 } WsDeque;
 
 // Allocate aligned memory for the ring buffer.
@@ -40,37 +52,80 @@ static inline void *wsq_aligned_alloc(size_t alignment, size_t nbytes) {
   return ptr;
 }
 
+// Allocate a WsqArray with `cap` u64 slots; returns NULL if either allocation fails.
+static inline WsqArray *wsq_array_new(size_t cap) {
+  WsqArray *arr = (WsqArray *)malloc(sizeof(WsqArray));
+  if (arr == NULL) return NULL;
+  arr->mask = cap - 1;
+  arr->buf  = (u64 *)wsq_aligned_alloc(CACHE_L1, cap * sizeof(u64));
+  if (arr->buf == NULL) { free(arr); arr = NULL; }  // no buffer: release the header too
+  return arr;
+}
+
+// Release a WsqArray together with its slot buffer; a NULL argument is ignored.
+static inline void wsq_array_free(WsqArray *a) {
+  if (a == NULL) return;
+  // Buffer first: it is only reachable through the header freed next.
+  free(a->buf);
+  free(a);
+}
+
+// Owner-only: grow the deque to 2x capacity. Returns the new array (exits on OOM).
+// Copies live elements [top, bot) from old to new, keeping each index's slot (i & mask).
+// Old array is stashed in prev[] and never freed here, so a reader holding it stays valid.
+static inline WsqArray *wsq_grow(WsDeque *q, WsqArray *old, u64 bot, u64 top) {
+  size_t new_cap = (old->mask + 1) * 2;
+  WsqArray *a = wsq_array_new(new_cap);
+  if (!a) {
+    fprintf(stderr, "wsq_grow: allocation failed (new_cap=%zu)\n", new_cap);
+    exit(1);
+  }
+  // Copy before publishing, so the new array is complete when it becomes visible.
+  for (u64 i = top; i < bot; i++) {
+    a->buf[i & a->mask] = old->buf[i & old->mask];
+  }
+  // Stash old array for deferred free; once prev[] is full the old array is dropped unfreed.
+  if (q->prev_count < WSQ_PREV_MAX) {
+    q->prev[q->prev_count++] = old;
+  }
+  // Publish new array (thieves will see it after acquire on arr).
+  atomic_store_explicit(&q->arr, a, memory_order_release);
+  return a;
+}
+
 // Initialize a deque with 2^capacity_pow2 slots.
 static inline int wsq_init(WsDeque *q, u32 capacity_pow2) {
   size_t cap = (size_t)1 << capacity_pow2;
-  q->buf = (u64 *)wsq_aligned_alloc(CACHE_L1, cap * sizeof(u64));
-  if (!q->buf) {
-    return 0;
-  }
-  q->cap  = cap;
-  q->mask = cap - 1;
+  WsqArray *a = wsq_array_new(cap);
+  if (!a) return 0;
+  atomic_store_explicit(&q->arr, a, memory_order_relaxed);
   atomic_store_explicit(&q->top.v, 0, memory_order_relaxed);
   atomic_store_explicit(&q->bot.v, 0, memory_order_relaxed);
+  q->prev_count = 0;
   return 1;
 }
 
-// Release the deque buffer.
+// Release the deque: free current array and all stashed old arrays.
 static inline void wsq_free(WsDeque *q) {
-  if (q && q->buf) {
-    free(q->buf);
-    q->buf = NULL;
+  if (!q) return;
+  WsqArray *a = atomic_load_explicit(&q->arr, memory_order_relaxed);
+  wsq_array_free(a);
+  for (u32 i = 0; i < q->prev_count; i++) {
+    wsq_array_free(q->prev[i]);
   }
+  q->prev_count = 0;
+  atomic_store_explicit(&q->arr, NULL, memory_order_relaxed);
 }
 
-// Owner push to the bottom; returns 1 on success, 0 if full.
+// Owner push to the bottom; grows if full. Returns 1 always (exits on OOM).
 static inline int wsq_push(WsDeque *q, u64 x) {
   u64 b = atomic_load_explicit(&q->bot.v, memory_order_relaxed);
   u64 t = atomic_load_explicit(&q->top.v, memory_order_acquire);
-  if (b - t >= q->cap) {
-    return 0;
+  WsqArray *a = atomic_load_explicit(&q->arr, memory_order_relaxed);
+  if (__builtin_expect(b - t > a->mask, 0)) {
+    a = wsq_grow(q, a, b, t);
   }
-  __builtin_prefetch(&q->buf[b & q->mask], 1, 1);
-  q->buf[b & q->mask] = x;
+  a->buf[b & a->mask] = x;
   atomic_store_explicit(&q->bot.v, b + 1, memory_order_release);
   return 1;
 }
@@ -82,13 +137,13 @@ static inline int wsq_pop(WsDeque *q, u64 *out) {
     return 0;
   }
   u64 b1 = b - 1;
-  __builtin_prefetch(&q->buf[b1 & q->mask], 0, 1);
   atomic_store_explicit(&q->bot.v, b1, memory_order_release);
   atomic_thread_fence(memory_order_seq_cst);
 
   u64 t = atomic_load_explicit(&q->top.v, memory_order_acquire);
   if (t <= b1) {
-    u64 x = q->buf[b1 & q->mask];
+    WsqArray *a = atomic_load_explicit(&q->arr, memory_order_relaxed);
+    u64 x = a->buf[b1 & a->mask];
     if (t == b1) {
       u64 expected = t;
       bool ok = atomic_compare_exchange_strong_explicit(
@@ -119,8 +174,8 @@ static inline int wsq_steal(WsDeque *q, u64 *out) {
   if (t >= b) {
     return 0;
   }
-  __builtin_prefetch(&q->buf[t & q->mask], 0, 1);
-  u64 x = q->buf[t & q->mask];
+  WsqArray *a = atomic_load_explicit(&q->arr, memory_order_acquire);
+  u64 x = a->buf[t & a->mask];
   u64 expected = t;
   bool ok = atomic_compare_exchange_strong_explicit(
     &q->top.v,
@@ -136,9 +191,15 @@ static inline int wsq_steal(WsDeque *q, u64 *out) {
   return 0;
 }
 
-
+// Check if there are stealable items (non-binding).
 static inline bool wsq_can_steal(WsDeque *q) {
   u64 t = atomic_load_explicit(&q->top.v, memory_order_acquire);
   u64 b = atomic_load_explicit(&q->bot.v, memory_order_acquire);
   return t < b;
-}
\ No newline at end of file
+}
+
+// Current slot count of the deque, or 0 when no array is installed (arr == NULL).
+static inline size_t wsq_capacity(WsDeque *q) {
+  WsqArray *cur = atomic_load_explicit(&q->arr, memory_order_relaxed);
+  return (cur == NULL) ? 0 : cur->mask + 1;
+}
diff --git a/clang/eval/collapse.c b/clang/eval/collapse.c
index 6b18d211..4fd1e8d5 100644
--- a/clang/eval/collapse.c
+++ b/clang/eval/collapse.c
@@ -192,11 +192,31 @@ fn void eval_collapse(Term term, int limit, int show_itrs, int silent) {
 
   C.silent = silent;
   C.show_itrs = show_itrs;
-  if (!wspq_init(&C.ws, n)) {
+
+  // Compute wspq sizing from hints.
+  u32 col_brackets = WSPQ_BRACKETS;
+  u32 col_cap_pow2 = WSPQ_CAP_POW2;
+  if (HVM_HINTS.node_count > 0) {
+    col_brackets = HVM_HINTS.max_depth + 4;
+    if (col_brackets > WSPQ_BRACKETS) col_brackets = WSPQ_BRACKETS;
+    if (col_brackets < 4) col_brackets = 4;
+    if (HVM_HINTS.has_sup) {
+      col_cap_pow2 = hints_cap_pow2(HVM_HINTS.sup_count * 4, 8, 24);
+    } else {
+      col_cap_pow2 = 8; // minimal: 256 entries per bucket
+    }
+  }
+  if (!wspq_init_sized(&C.ws, n, col_brackets, col_cap_pow2)) {
     fprintf(stderr, "eval_collapse: queue allocation failed\n");
     exit(1);
   }
-  if (!cnf_pool_init(&C.cnf, n)) {
+
+  // Compute cnf pool sizing from hints.
+  u32 cnf_cap_pow2 = CNF_POOL_WS_CAP_POW2;
+  if (HVM_HINTS.node_count > 0) {
+    cnf_cap_pow2 = hints_cap_pow2(HVM_HINTS.node_count / 8, 8, 20);
+  }
+  if (!cnf_pool_init_sized(&C.cnf, n, cnf_cap_pow2)) {
     fprintf(stderr, "eval_collapse: cnf queue allocation failed\n");
     exit(1);
   }
diff --git a/clang/eval/normalize.c b/clang/eval/normalize.c
index 1c694ecf..9badd06b 100644
--- a/clang/eval/normalize.c
+++ b/clang/eval/normalize.c
@@ -140,13 +140,28 @@ fn Term eval_normalize(Term term) {
   u32 n = thread_get_count();
   ctx.n = n;
   atomic_store_explicit(&ctx.pending.v, n, memory_order_relaxed);
+
+  // Compute queue capacity from hints.
+  u32 norm_cap_pow2 = EVAL_NORMALIZE_WS_CAP_POW2;
+  if (HVM_HINTS.node_count > 0) {
+    norm_cap_pow2 = hints_cap_pow2(HVM_HINTS.node_count / 4, 8, 24);
+  }
   for (u32 i = 0; i < n; i++) {
-    if (!wsq_init(&ctx.W[i].dq, EVAL_NORMALIZE_WS_CAP_POW2)) {
+    if (!wsq_init(&ctx.W[i].dq, norm_cap_pow2)) {
       fprintf(stderr, "eval_normalize: queue allocation failed\n");
       exit(1);
     }
   }
-  uset_init(&ctx.seen);
+
+  // Compute uset size from hints.
+  if (HVM_HINTS.static_heap > 0) {
+    u64 uset_locs = HVM_HINTS.static_heap * 64;
+    if (uset_locs < 4096) uset_locs = 4096;
+    if (uset_locs > HEAP_CAP) uset_locs = HEAP_CAP;
+    uset_init_sized(&ctx.seen, uset_locs);
+  } else {
+    uset_init(&ctx.seen);
+  }
 
   eval_normalize_enqueue(&ctx, &ctx.W[0], root_loc);
 
diff --git a/clang/hvm4.c b/clang/hvm4.c
index 4fbf21c3..f99e7016 100644
--- a/clang/hvm4.c
+++ b/clang/hvm4.c
@@ -504,12 +504,20 @@ static int    PARSE_FORK_SIDE = -1;      // -1 = off, 0 = left branch (DP0), 1 =
 #include "data/uset.c"
 #include "data/wsq.c"
 #include "data/wspq.c"
+#include "data/elastic_ring.c"
 
 // CNF
 // ===
 
 #include "cnf/_.c"
 
+// Analyze
+// =======
+
+#include "analyze/hints.c"
+
+static HvmHints HVM_HINTS = {0};
+
 // Eval
 // ====
 
diff --git a/clang/main.c b/clang/main.c
index c5d2b5f4..2ee9e10e 100644
--- a/clang/main.c
+++ b/clang/main.c
@@ -4,13 +4,14 @@
 // This file provides the command-line interface for the HVM4 runtime,
 // mirroring the structure of main.hs for the Haskell implementation.
 //
-// Usage: ./main <file.hvm4> [-s] [-S] [-D] [-C[N]] [-T<N>]
+// Usage: ./main <file.hvm4> [-s] [-S] [-D] [-C[N]] [-T<N>] [-v]
 //   -s:  Show statistics (interactions, time, performance)
 //   -S:  Silent output (omit term printing)
 //   -D:  Step-by-step reduction (print intermediate terms)
 //   -C:  Collapse and flatten (enumerate all superposition branches)
 //   -CN: Collapse and flatten, limit to N results
 //   -T:  Use N threads (e.g. -T4)
+//   -v:  Verbose: print program hints after parsing
 
 #include "hvm4.c"
 
@@ -32,6 +33,8 @@ typedef struct {
   int   debug;
   int   step_by_step;
   int   threads;
+  int   verbose;
+  int   test_ring;
   u32     ffi_loads_len;
   FfiLoad ffi_loads[FFI_MAX];
   char *file;
@@ -46,6 +49,8 @@ fn CliOpts parse_opts(int argc, char **argv) {
     .debug = 0,
     .step_by_step = 0,
     .threads = 0,
+    .verbose = 0,
+    .test_ring = 0,
     .ffi_loads_len = 0,
     .file = NULL
   };
@@ -74,6 +79,10 @@ fn CliOpts parse_opts(int argc, char **argv) {
         fprintf(stderr, "Error: -T value (%d) exceeds MAX_THREADS (%d)\n", opts.threads, MAX_THREADS);
         exit(1);
       }
+    } else if (strcmp(argv[i], "-v") == 0) {
+      opts.verbose = 1;
+    } else if (strcmp(argv[i], "--test-ring") == 0) {
+      opts.test_ring = 1;
     } else if (strcmp(argv[i], "-d") == 0) {
       opts.debug = 1;
     } else if (strcmp(argv[i], "-D") == 0) {
@@ -133,8 +142,12 @@ int main(int argc, char **argv) {
   // Parse command line
   CliOpts opts = parse_opts(argc, argv);
 
+  if (opts.test_ring) {
+    return ring_test();
+  }
+
   if (opts.file == NULL) {
-    fprintf(stderr, "Usage: ./main <file.hvm4> [-s] [-S] [-D] [-C[N]] [-T<N>] [--ffi <path>] [--ffi-dir <path>]\n");
+    fprintf(stderr, "Usage: ./main <file.hvm4> [-s] [-S] [-D] [-C[N]] [-T<N>] [-v] [--ffi <path>] [--ffi-dir <path>]\n");
     return 1;
   }
 
@@ -203,6 +216,12 @@ int main(int argc, char **argv) {
   parse_def(&s);
   free(src);
 
+  // Analyze program and compute buffer sizing hints
+  HVM_HINTS = hvm_analyze();
+  if (opts.verbose) {
+    hvm_hints_print(&HVM_HINTS);
+  }
+
   // Get @main id
   u32 main_id = table_find("main", 4);
 

From 19d2bc11f779a04933c0abadbbfb1dd96e5ec20f Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 13:36:02 +0200
Subject: [PATCH 3/6] Add Stable Index Vector (SIV) for dense cancel-friendly
 storage

Standalone data structure for future Hybrid SIV + Ring work queues.
Dense swap-compacted array with stable IDs, separate data/ID capacity
growth, and O(1) push/erase/valid/get operations. Self-test via
--test-siv flag.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/data/siv.c | 245 +++++++++++++++++++++++++++++++++++++++++++++++
 clang/hvm4.c     |   1 +
 clang/main.c     |   7 ++
 3 files changed, 253 insertions(+)
 create mode 100644 clang/data/siv.c

diff --git a/clang/data/siv.c b/clang/data/siv.c
new file mode 100644
index 00000000..9e00d75a
--- /dev/null
+++ b/clang/data/siv.c
@@ -0,0 +1,245 @@
+// data/siv.c - Stable Index Vector for dense, cancel-friendly storage.
+//
+// Context
+// - Companion to ring/deque-based work queues: ring holds ordering (u32 IDs),
+//   SIV holds the actual data (u64 values) in a dense, swap-compacted array.
+// - IDs are stable: an ID remains valid until explicitly erased, regardless
+//   of other insertions or deletions.
+//
+// Design
+// - Dense data[] array: live values packed at indices [0, count).
+// - Two maps: id_to_slot (ID -> position in data[]) and slot_to_id (inverse).
+// - Push: append at data[count], assign monotonic next_id, update both maps.
+// - Erase: swap data[slot] with data[count-1], update maps, decrement count.
+// - Both are O(1). Iteration over live entries is a cache-friendly linear scan.
+//
+// Notes
+// - Single-threaded (owner only). For work-stealing, the owner pushes IDs into
+//   a concurrent ring; thieves read SIV data via the ID after stealing.
+// - ID space is monotonic: id_to_slot[] grows with total pushes (not live count).
+//   Separate capacity tracking for data[] (count-bound) and id_to_slot[] (id-bound).
+// - Erase of an already-erased ID is a safe no-op.
+
+#include <stdlib.h>
+#include <string.h>
+
+#define SIV_INVALID 0xFFFFFFFFu
+
+typedef struct {
+  u64 *data;         // dense values; slots [0, count) are live
+  u32 *id_to_slot;   // ID -> slot in data[]; SIV_INVALID if erased or never assigned
+  u32 *slot_to_id;   // slot -> ID; lets erase fix up the entry it swaps in
+  u32  count;        // live entries (also the next free dense slot)
+  u32  cap;          // allocated length of data[] and slot_to_id[]
+  u32  next_id;      // next ID to hand out; equals total pushes so far
+  u32  id_cap;       // allocated length of id_to_slot[]
+} Siv;
+
+// Set up an empty vector. initial_cap sizes both the dense arrays and the ID map
+// (0 selects 64). On allocation failure *s is zeroed and false is returned.
+fn bool siv_init(Siv *s, u32 initial_cap) {
+  u32    n        = initial_cap ? initial_cap : 64;
+  size_t id_bytes = (size_t)n * sizeof(u32);
+  u64   *vals     = (u64 *)malloc((size_t)n * sizeof(u64));
+  u32   *inverse  = (u32 *)malloc(id_bytes);
+  u32   *forward  = (u32 *)malloc(id_bytes);
+  if (vals == NULL || inverse == NULL || forward == NULL) {
+    free(vals); free(inverse); free(forward);
+    memset(s, 0, sizeof(*s));
+    return false;
+  }
+  memset(forward, 0xFF, id_bytes); // every byte 0xFF => every entry SIV_INVALID
+  *s = (Siv){ .data = vals, .id_to_slot = forward, .slot_to_id = inverse,
+              .count = 0, .cap = n, .next_id = 0, .id_cap = n };
+  return true;
+}
+
+fn void siv_free(Siv *s) { // releases the three arrays and zeroes *s (s itself is not freed)
+  free(s->id_to_slot);
+  free(s->slot_to_id);
+  free(s->data);
+  memset(s, 0, sizeof *s);
+}
+
+// Double data[] and slot_to_id[]. Returns false with cap unchanged if realloc
+// fails or if cap * 2 would overflow u32 (wrapping to 0 or a smaller size).
+static inline bool siv_grow_data(Siv *s) {
+  if (s->cap > 0x7FFFFFFFu) return false;
+  u32 new_cap = s->cap * 2;
+  u64 *nd = (u64 *)realloc(s->data, (size_t)new_cap * sizeof(u64));
+  u32 *ns = (u32 *)realloc(s->slot_to_id, (size_t)new_cap * sizeof(u32));
+  // A successful realloc invalidates the old pointer, so store each result even
+  // if the other one failed; the larger block is still valid at the old cap.
+  if (nd) s->data = nd;
+  if (ns) s->slot_to_id = ns;
+  if (!nd || !ns) return false;
+  s->cap = new_cap;
+  return true;
+}
+
+// Double id_to_slot[] (new tail = SIV_INVALID); false on realloc failure or u32 overflow.
+static inline bool siv_grow_ids(Siv *s) {
+  if (s->id_cap > 0x7FFFFFFFu) return false; // id_cap * 2 would wrap and shrink/free the map
+  u32 new_id_cap = s->id_cap * 2;
+  u32 *ni = (u32 *)realloc(s->id_to_slot, (size_t)new_id_cap * sizeof(u32));
+  if (!ni) return false;
+  memset(ni + s->id_cap, 0xFF, (size_t)(new_id_cap - s->id_cap) * sizeof(u32));
+  s->id_to_slot = ni;
+  s->id_cap     = new_id_cap;
+  return true;
+}
+
+// Append val and return its stable ID, or SIV_INVALID (nothing inserted) if a
+// required growth of either array fails.
+fn u32 siv_push(Siv *s, u64 val) {
+  bool data_full = s->count >= s->cap;
+  if (data_full && !siv_grow_data(s)) return SIV_INVALID;
+  bool ids_full = s->next_id >= s->id_cap;
+  if (ids_full && !siv_grow_ids(s)) return SIV_INVALID;
+
+  // The new entry takes the first free dense slot and the next unused ID.
+  u32 slot = s->count;
+  u32 id   = s->next_id;
+  s->id_to_slot[id]   = slot;
+  s->slot_to_id[slot] = id;
+  s->data[slot]       = val;
+
+  s->count   = slot + 1;
+  s->next_id = id + 1;
+  return id;
+}
+
+// True iff id was handed out by siv_push and has not been erased since.
+fn bool siv_valid(Siv *s, u32 id) {
+  return !(id >= s->next_id || id >= s->id_cap || s->id_to_slot[id] == SIV_INVALID);
+}
+
+// Value stored under id. No validation here: id must pass siv_valid() first.
+fn u64 siv_get(Siv *s, u32 id) {
+  return *(s->data + s->id_to_slot[id]);
+}
+
+// Remove id, filling its dense slot with the last live entry so data[] stays
+// packed. Out-of-range or already-erased IDs are ignored.
+fn void siv_erase(Siv *s, u32 id) {
+  if (id >= s->id_cap) return;
+  u32 hole = s->id_to_slot[id];
+  if (hole == SIV_INVALID) return;
+
+  u32 tail = --s->count; // index of the last live entry before removal
+  s->id_to_slot[id] = SIV_INVALID;
+  if (hole == tail) return; // erased entry was already last: nothing to move
+
+  // Relocate the last entry into the hole and repoint both maps at it.
+  u32 tail_id = s->slot_to_id[tail];
+
+  s->data[hole]          = s->data[tail];
+  s->slot_to_id[hole]    = tail_id;
+  s->id_to_slot[tail_id] = hole;
+}
+
+// Number of live entries, i.e. the length of the dense prefix of data[].
+fn u32 siv_count(Siv *s) { return s->count; }
+
+// Base of the dense array: [0, siv_count(s)) are live. siv_push may realloc it; siv_erase reorders it.
+fn u64 *siv_data(Siv *s) { return s->data; }
+
+// ID stored at a dense slot (for iteration with ID tracking). No bounds check: slot must be < siv_count(s).
+fn u32 siv_slot_id(Siv *s, u32 slot) { return s->slot_to_id[slot]; }
+
+// ============================================================
+// Self-test: returns 0 on success, 1 if siv_init fails; all other checks are assert()s.
+// ============================================================
+
+fn int siv_test(void) {
+  Siv s;
+
+  // 1. Init with a tiny capacity (4) so later steps force both arrays to grow
+  if (!siv_init(&s, 4)) {
+    fprintf(stderr, "siv_test: init failed\n");
+    return 1;
+  }
+  assert(siv_count(&s) == 0);
+
+  // 2. Push and retrieve
+  u32 id0 = siv_push(&s, 100);
+  u32 id1 = siv_push(&s, 200);
+  u32 id2 = siv_push(&s, 300);
+  assert(id0 != SIV_INVALID && id1 != SIV_INVALID && id2 != SIV_INVALID);
+  assert(siv_count(&s) == 3);
+  assert(siv_get(&s, id0) == 100);
+  assert(siv_get(&s, id1) == 200);
+  assert(siv_get(&s, id2) == 300);
+
+  // 3. Erase middle — the last entry (id2) is swapped into the freed slot
+  siv_erase(&s, id1);
+  assert(siv_count(&s) == 2);
+  assert(!siv_valid(&s, id1));
+  assert(siv_valid(&s, id0));
+  assert(siv_valid(&s, id2));
+  assert(siv_get(&s, id0) == 100);
+  assert(siv_get(&s, id2) == 300);
+
+  // 4. Double erase is no-op
+  siv_erase(&s, id1);
+  assert(siv_count(&s) == 2);
+
+  // 5. Dense iteration sees exactly the live values (order may differ after swap)
+  u64 sum = 0;
+  for (u32 i = 0; i < siv_count(&s); i++) sum += siv_data(&s)[i];
+  assert(sum == 400); // 100 + 300
+
+  // 6. Push after erase: fills the freed dense slot, but under a fresh ID
+  u32 id3 = siv_push(&s, 400);
+  assert(siv_count(&s) == 3);
+  assert(siv_get(&s, id3) == 400);
+
+  // 7. Growth: push beyond initial capacity (was 4); existing entries must survive
+  u32 id4 = siv_push(&s, 500);
+  u32 id5 = siv_push(&s, 600);
+  assert(siv_count(&s) == 5);
+  assert(siv_get(&s, id4) == 500);
+  assert(siv_get(&s, id5) == 600);
+  // All earlier IDs still valid
+  assert(siv_get(&s, id0) == 100);
+  assert(siv_get(&s, id2) == 300);
+  assert(siv_get(&s, id3) == 400);
+
+  // 8. ID space growth: push many, erase many, push more
+  //    This exercises id_cap growth independently of data cap growth.
+  for (u32 i = 0; i < 200; i++) {
+    u32 id = siv_push(&s, 1000 + i);
+    assert(id != SIV_INVALID);
+  }
+  assert(siv_count(&s) == 205); // 5 + 200
+  // Erase from slot i = 0 until only 10 entries remain (i is never advanced)
+  for (u32 i = 0; i < siv_count(&s) - 10; ) {
+    u32 eid = siv_slot_id(&s, i);
+    siv_erase(&s, eid);
+    // After erase, slot i has the swapped-in element — don't increment
+    if (siv_count(&s) <= 10) break;
+  }
+  assert(siv_count(&s) == 10);
+  // Push more — these get new IDs past the old high-water mark
+  for (u32 i = 0; i < 50; i++) {
+    u32 id = siv_push(&s, 9000 + i);
+    assert(id != SIV_INVALID);
+  }
+  assert(siv_count(&s) == 60);
+
+  // 9. Erase all
+  while (siv_count(&s) > 0) {
+    siv_erase(&s, siv_slot_id(&s, 0));
+  }
+  assert(siv_count(&s) == 0);
+
+  // 10. Reuse after full drain
+  u32 id_after = siv_push(&s, 42);
+  assert(id_after != SIV_INVALID);
+  assert(siv_count(&s) == 1);
+  assert(siv_get(&s, id_after) == 42);
+
+  siv_free(&s);
+  fprintf(stderr, "[siv] all tests passed\n");
+  return 0;
+}
diff --git a/clang/hvm4.c b/clang/hvm4.c
index f99e7016..1dec32bf 100644
--- a/clang/hvm4.c
+++ b/clang/hvm4.c
@@ -505,6 +505,7 @@ static int    PARSE_FORK_SIDE = -1;      // -1 = off, 0 = left branch (DP0), 1 =
 #include "data/wsq.c"
 #include "data/wspq.c"
 #include "data/elastic_ring.c"
+#include "data/siv.c"
 
 // CNF
 // ===
diff --git a/clang/main.c b/clang/main.c
index 2ee9e10e..4498652b 100644
--- a/clang/main.c
+++ b/clang/main.c
@@ -35,6 +35,7 @@ typedef struct {
   int   threads;
   int   verbose;
   int   test_ring;
+  int   test_siv;
   u32     ffi_loads_len;
   FfiLoad ffi_loads[FFI_MAX];
   char *file;
@@ -51,6 +52,7 @@ fn CliOpts parse_opts(int argc, char **argv) {
     .threads = 0,
     .verbose = 0,
     .test_ring = 0,
+    .test_siv = 0,
     .ffi_loads_len = 0,
     .file = NULL
   };
@@ -83,6 +85,8 @@ fn CliOpts parse_opts(int argc, char **argv) {
       opts.verbose = 1;
     } else if (strcmp(argv[i], "--test-ring") == 0) {
       opts.test_ring = 1;
+    } else if (strcmp(argv[i], "--test-siv") == 0) {
+      opts.test_siv = 1;
     } else if (strcmp(argv[i], "-d") == 0) {
       opts.debug = 1;
     } else if (strcmp(argv[i], "-D") == 0) {
@@ -145,6 +149,9 @@ int main(int argc, char **argv) {
   if (opts.test_ring) {
     return ring_test();
   }
+  if (opts.test_siv) {
+    return siv_test();
+  }
 
   if (opts.file == NULL) {
     fprintf(stderr, "Usage: ./main <file.hvm4> [-s] [-S] [-D] [-C[N]] [-T<N>] [-v] [--ffi <path>] [--ffi-dir <path>]\n");

From fb09e28687d1d2157fea61ec37dc133768dd8cfc Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 13:41:37 +0200
Subject: [PATCH 4/6] Eliminate malloc+memcpy in elastic ring grow/shrink

Leverage fd data persistence across ftruncate+remap: on growth, existing
data at fd offsets [0, old_cap) survives the extend. Only the wrapped
prefix (if any) needs a memcpy within the mapping. On shrink, compact
live data to offset 0 with a single memmove before truncating.

Removes the malloc/free allocation overhead from the resize hot path.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/data/elastic_ring.c | 90 ++++++++++++++++++++-------------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/clang/data/elastic_ring.c b/clang/data/elastic_ring.c
index 90f05674..ece17d5e 100644
--- a/clang/data/elastic_ring.c
+++ b/clang/data/elastic_ring.c
@@ -174,22 +174,23 @@ fn void ring_free(ElasticRing *r) {
 
 // Double the buffer capacity. Live data is preserved.
 // Returns true on success, false on failure (buffer unchanged).
+//
+// Zero-copy path: ftruncate extends the backing fd, and existing data at
+// fd offsets [0, old_cap) is preserved. After remapping at 2*cap, the live
+// data is still at its original offsets. Two cases:
+//   - No wrap (tail < head): data at [tail, head) — already contiguous, no copy.
+//   - Wrapped (tail >= head, count > 0): data at [tail, old_cap) and [0, head).
+//     After remap, move the [0, head) prefix to [old_cap, old_cap+head) via
+//     memmove within the mapping. This makes the live range [tail, old_cap+head)
+//     contiguous in the new buffer.
 fn bool ring_grow(ElasticRing *r) {
-  size_t old_cap = r->cap;
-  size_t new_cap = old_cap * 2;
-  size_t used    = r->count;
-
-  // Save live data via double-map (always contiguous from data+tail).
-  u8 *save = NULL;
-  if (used > 0) {
-    save = (u8 *)malloc(used);
-    if (!save) return false;
-    memcpy(save, r->data + r->tail, used);
-  }
+  size_t old_cap  = r->cap;
+  size_t new_cap  = old_cap * 2;
+  size_t old_head = r->head;
+  size_t old_tail = r->tail;
 
-  // Extend backing fd.
+  // Extend backing fd. Existing bytes at [0, old_cap) are preserved.
   if (ftruncate(r->fd, (off_t)new_cap) != 0) {
-    free(save);
     return false;
   }
 
@@ -204,46 +205,54 @@ fn bool ring_grow(ElasticRing *r) {
       fprintf(stderr, "elastic_ring: fatal remap failure during growth\n");
       exit(1);
     }
-    if (save) { memcpy(r->data, save, used); free(save); }
-    r->tail = 0;
-    r->head = used;
     return false;
   }
 
-  // Restore live data at position 0.
-  if (save) {
-    memcpy(r->data, save, used);
-    free(save);
-  }
-
   r->cap  = new_cap;
   r->mask = new_cap - 1;
-  r->tail = 0;
-  r->head = used;
+
+  // Unwrap if data was wrapped around the old boundary.
+  bool wrapped = (r->count > 0 && old_tail >= old_head);
+  if (wrapped) {
+    // Move the [0, old_head) prefix to [old_cap, old_cap + old_head).
+    // These fd regions don't overlap, so memcpy is safe.
+    memcpy(r->data + old_cap, r->data, old_head);
+    r->head = old_cap + old_head;
+    // tail stays the same.
+  }
+
   return true;
 }
 
 // Halve the buffer if significantly underutilized (count <= cap/4).
 // Best-effort: failure leaves the buffer unchanged.
+//
+// Before truncating the fd, compact live data into [0, count) so it fits
+// entirely within the new smaller capacity. A wrapped range is moved in two
+// steps, because memmove cannot see aliasing through the double mapping.
 fn void ring_shrink(ElasticRing *r) {
-  size_t new_cap = r->cap / 2;
+  size_t old_cap = r->cap;
+  size_t new_cap = old_cap / 2;
   if (r->count > new_cap || new_cap < r->pg) return;
 
   size_t used = r->count;
-  u8 *save = NULL;
-  if (used > 0) {
-    save = (u8 *)malloc(used);
-    if (!save) return;
-    memcpy(save, r->data + r->tail, used);
+
+  // Compact to fd[0, used). memmove can't see mirror aliasing, so unwrap by hand.
+  size_t lead = old_cap - r->tail; // live bytes before the physical end
+  if (used > lead) { // wrapped: park prefix [0, used - lead) at [lead, used) first
+    memmove(r->data + lead, r->data, used - lead);
   }
+  if (r->tail != 0) memmove(r->data, r->data + r->tail, used < lead ? used : lead);
 
-  munmap(r->data, 2 * r->cap);
+  munmap(r->data, 2 * old_cap);
 
   if (ftruncate(r->fd, (off_t)new_cap) != 0) {
-    // Restore original.
-    ftruncate(r->fd, (off_t)r->cap);
-    r->data = (u8 *)ering_double_map(r->fd, r->cap);
-    if (save) { memcpy(r->data, save, used); free(save); }
+    // Restore original mapping. Data is already compacted at offset 0.
+    r->data = (u8 *)ering_double_map(r->fd, old_cap);
+    if (!r->data) {
+      fprintf(stderr, "elastic_ring: fatal remap failure during shrink\n");
+      exit(1);
+    }
     r->tail = 0;
     r->head = used;
     return;
@@ -251,23 +260,17 @@ fn void ring_shrink(ElasticRing *r) {
 
   r->data = (u8 *)ering_double_map(r->fd, new_cap);
   if (!r->data) {
-    ftruncate(r->fd, (off_t)r->cap);
-    r->data = (u8 *)ering_double_map(r->fd, r->cap);
+    ftruncate(r->fd, (off_t)old_cap);
+    r->data = (u8 *)ering_double_map(r->fd, old_cap);
     if (!r->data) {
       fprintf(stderr, "elastic_ring: fatal remap failure during shrink\n");
       exit(1);
     }
-    if (save) { memcpy(r->data, save, used); free(save); }
     r->tail = 0;
     r->head = used;
     return;
   }
 
-  if (save) {
-    memcpy(r->data, save, used);
-    free(save);
-  }
-
   r->cap  = new_cap;
   r->mask = new_cap - 1;
   r->tail = 0;
@@ -405,7 +408,6 @@ fn int ring_test(void) {
   }
   // Now used = n/2, head is past 3/4, tail is at 1/4.
   // Push enough to overflow → triggers growth while data wraps.
-  size_t remaining_before = ring_used(&r) / sizeof(u64);
   size_t to_push = n;  // more than avail → forces growth
   for (size_t i = 0; i < to_push; i++) assert(ring_push_u64(&r, i + 200000));
   // Verify the earlier data.

From cd072b98e85a4fcb3614c9a509eea2d37d82d6ae Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 13:46:43 +0200
Subject: [PATCH 5/6] Harden elastic ring and SIV: docs, edge cases, test
 coverage

elastic_ring.c:
- Update header comment to reflect zero-copy growth (was stale)
- Add thread-safety notes for future MT integration
- Document byte-level alignment contract
- Enforce cap/4 threshold in ring_shrink (was documented but not checked)
- Add tests: shrink with wrapped data, cap/4 threshold rejection,
  grow when empty, direct double-map mirror verification

siv.c:
- Document u32 ID space exhaustion (4B push limit)
- Add thread-safety contract for SIV+Ring thief pattern
  (release/acquire barriers, erase-during-read race)
- Add tests: garbage ID rejection (SIV_INVALID, near-max, beyond
  next_id, erased), independent id_cap vs data_cap growth across
  3 push-erase rounds, erase-last-element (no-swap path)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 clang/data/elastic_ring.c | 78 +++++++++++++++++++++++++++++++++++----
 clang/data/siv.c          | 51 ++++++++++++++++++++++++-
 2 files changed, 120 insertions(+), 9 deletions(-)

diff --git a/clang/data/elastic_ring.c b/clang/data/elastic_ring.c
index ece17d5e..1dca0b48 100644
--- a/clang/data/elastic_ring.c
+++ b/clang/data/elastic_ring.c
@@ -5,21 +5,29 @@
 // - Same physical memory is mapped twice contiguously in virtual address space,
 //   so reads/writes that cross the buffer boundary are seamless (no split logic).
 // - Elastic: grows via ftruncate + remap, shrinks via ftruncate + remap.
-//   Data is preserved through the backing fd; growth copies only live data.
+//   The backing fd preserves data across remaps — growth is zero-copy when the
+//   live data doesn't wrap, or a single memcpy of the wrapped prefix otherwise.
 //
 // Design
 // - Linux: memfd_create for anonymous backing fd.
 // - macOS/POSIX: shm_open + immediate shm_unlink for anonymous backing fd.
 // - Double-map: reserve 2*cap virtual space, MAP_FIXED both halves to same fd.
 // - Modular head/tail indices in [0, cap). Separate count for full/empty.
-// - Growth: save live data (contiguous via double-map), extend fd, remap, restore.
-// - Shrink: same approach in reverse, halving capacity.
+// - Growth: ftruncate to 2*cap, remap, unwrap prefix if wrapped. Zero-copy
+//   when data doesn't wrap (common case for monotonic push/pop patterns).
+// - Shrink: compact live data to offset 0, ftruncate to cap/2, remap.
 //
 // Notes
 // - Single-threaded (owner only). Concurrency is handled at a higher layer.
+//   For multi-threaded use (e.g. Chase-Lev backing): grow/shrink must not race
+//   with any readers. Thieves holding pointers from ring_pop_ptr become invalid
+//   after grow/shrink tears down the mapping. An atomic (data, mask) pair or
+//   quiescence protocol is needed — see wsq.c WsqArray pattern.
 // - Capacity is always a power-of-two multiple of page size.
 // - ring_push_ptr / ring_pop_ptr return pointers valid for contiguous access
 //   up to (cap - count) and count bytes respectively, even across the boundary.
+// - The byte-level API assumes callers handle alignment. The u64 convenience
+//   functions are always aligned since sizeof(u64) divides the page size.
 
 #include <sys/mman.h>
 #include <unistd.h>
@@ -233,7 +241,9 @@ fn bool ring_grow(ElasticRing *r) {
 fn void ring_shrink(ElasticRing *r) {
   size_t old_cap = r->cap;
   size_t new_cap = old_cap / 2;
+  // Only shrink if usage is at most 25% of capacity and new cap is viable.
   if (r->count > new_cap || new_cap < r->pg) return;
+  if (r->count > old_cap / 4) return;
 
   size_t used = r->count;
 
@@ -422,10 +432,64 @@ fn int ring_test(void) {
   }
   assert(ring_used(&r) == 0);
 
-  // 8. Shrink.
-  assert(ring_push_u64(&r, 12345));
-  ring_shrink(&r);
-  assert(ring_pop_u64(&r, &v) && v == 12345);
+  // 8. Shrink with wrapped data: advance head/tail to 3/4, push a few
+  //    elements so data wraps, then shrink. The memmove should compact
+  //    the wrapped data to offset 0 before truncating.
+  {
+    size_t sn = r.cap / sizeof(u64);
+    // Advance to 3/4 position.
+    for (size_t i = 0; i < sn * 3 / 4; i++) assert(ring_push_u64(&r, 0xAA));
+    for (size_t i = 0; i < sn * 3 / 4; i++) assert(ring_pop_u64(&r, &v));
+    // Push a few — head wraps past cap.
+    u32 few = 4;
+    for (u32 i = 0; i < few; i++) assert(ring_push_u64(&r, 55555 + i));
+    // Now count is small, cap is large — shrink should fire.
+    size_t cap_before = r.cap;
+    ring_shrink(&r);
+    assert(r.cap < cap_before);
+    // Verify data survived the shrink.
+    for (u32 i = 0; i < few; i++) {
+      assert(ring_pop_u64(&r, &v));
+      assert(v == 55555 + i);
+    }
+    assert(ring_used(&r) == 0);
+  }
+
+  // 9. Shrink respects cap/4 threshold: don't shrink if usage > cap/4.
+  {
+    size_t sn = r.cap / sizeof(u64);
+    // Fill to 30% capacity — above the 25% threshold.
+    size_t fill = sn * 30 / 100;
+    if (fill == 0) fill = 1;
+    for (size_t i = 0; i < fill; i++) assert(ring_push_u64(&r, i));
+    size_t cap_before = r.cap;
+    ring_shrink(&r);
+    assert(r.cap == cap_before); // should NOT have shrunk
+    for (size_t i = 0; i < fill; i++) assert(ring_pop_u64(&r, &v));
+  }
+
+  // 10. Grow when empty — should be a cheap no-data-copy expansion.
+  {
+    size_t cap_before = r.cap;
+    assert(ring_grow(&r));
+    assert(r.cap == cap_before * 2);
+    assert(ring_used(&r) == 0);
+    // Push/pop still works after empty grow.
+    assert(ring_push_u64(&r, 99999));
+    assert(ring_pop_u64(&r, &v) && v == 99999);
+  }
+
+  // 11. Double-map verification: write at offset X, read at offset X+cap.
+  //     This directly tests the ouroboros property of the backing mapping.
+  {
+    assert(ring_used(&r) == 0);
+    size_t off = 128; // arbitrary offset within the first half
+    if (off + sizeof(u64) <= r.cap) {
+      *(u64 *)(r.data + off) = 0xDEADBEEFCAFEull;
+      u64 mirror = *(u64 *)(r.data + off + r.cap);
+      assert(mirror == 0xDEADBEEFCAFEull);
+    }
+  }
 
   ring_free(&r);
   fprintf(stderr, "[elastic_ring] all tests passed\n");
diff --git a/clang/data/siv.c b/clang/data/siv.c
index 9e00d75a..30a803e3 100644
--- a/clang/data/siv.c
+++ b/clang/data/siv.c
@@ -16,8 +16,19 @@
 // Notes
 // - Single-threaded (owner only). For work-stealing, the owner pushes IDs into
 //   a concurrent ring; thieves read SIV data via the ID after stealing.
-// - ID space is monotonic: id_to_slot[] grows with total pushes (not live count).
-//   Separate capacity tracking for data[] (count-bound) and id_to_slot[] (id-bound).
+//   Thread-safety contract for SIV+Ring integration:
+//     * Owner: siv_push (then publish ID to ring), siv_erase (after thief is done).
+//     * Thief: siv_valid + siv_get (after stealing ID from ring).
+//     * Requires: release barrier after siv_push (before ring push), acquire
+//       barrier in thief (after ring steal, before siv_get).
+//     * Race: if owner calls siv_erase while thief is in siv_get for the same ID,
+//       the thief may read a stale/swapped value. The thief must copy the value
+//       before the owner can erase, or the cancel protocol must ensure the owner
+//       only erases IDs that no thief is currently reading.
+// - ID space is monotonic u32 and IDs are never reused. id_to_slot[] grows by doubling,
+//   and id_cap * 2 stops fitting in u32 at 2^31, so budget ~2 billion pushes total (not
+//   live). Unreachable for HVM4 work queues; else add a reset or generation counter.
+// - Separate capacity tracking for data[] (count-bound) and id_to_slot[] (id-bound).
 // - Erase of an already-erased ID is a safe no-op.
 
 #include <stdlib.h>
@@ -238,6 +249,42 @@ fn int siv_test(void) {
   assert(id_after != SIV_INVALID);
   assert(siv_count(&s) == 1);
   assert(siv_get(&s, id_after) == 42);
+  siv_erase(&s, id_after);
+
+  // 11. siv_valid rejects garbage IDs
+  assert(!siv_valid(&s, SIV_INVALID));     // sentinel value
+  assert(!siv_valid(&s, 0xFFFFFFFE));      // near-max u32
+  assert(!siv_valid(&s, 999999));          // beyond next_id
+  assert(!siv_valid(&s, 0));               // was valid, now erased
+
+  // 12. Independent capacity growth: data cap vs id cap.
+  //     Push many, erase all, push many again. id_cap grows with total pushes
+  //     while data cap stays small (because count is always low).
+  for (u32 round = 0; round < 3; round++) {
+    for (u32 i = 0; i < 100; i++) {
+      u32 id = siv_push(&s, 8000 + round * 100 + i);
+      assert(id != SIV_INVALID);
+    }
+    // Erase all — data cap doesn't need to grow, but id_cap keeps rising
+    while (siv_count(&s) > 0) {
+      siv_erase(&s, siv_slot_id(&s, 0));
+    }
+  }
+  // After 3 rounds of 100 push+erase, next_id is 300+ but count is 0.
+  // id_cap must have grown, data cap may not have.
+  assert(siv_count(&s) == 0);
+  assert(s.id_cap >= 300);
+  // Push one more to verify everything still works
+  u32 id_final = siv_push(&s, 77777);
+  assert(id_final != SIV_INVALID);
+  assert(siv_get(&s, id_final) == 77777);
+
+  // 13. Erase last element (slot == last, no swap needed)
+  u32 id_only = siv_push(&s, 88888);
+  // Now count=2. Erase id_only (it's at the last slot).
+  siv_erase(&s, id_only);
+  assert(siv_count(&s) == 1);
+  assert(siv_get(&s, id_final) == 77777); // earlier push still valid
 
   siv_free(&s);
   fprintf(stderr, "[siv] all tests passed\n");

From 29eee08cda5970269d7f694b5b12d0dcbcca3bb5 Mon Sep 17 00:00:00 2001
From: christos chatzifountas <christos.chatzifountas@biotz.io>
Date: Fri, 13 Feb 2026 20:30:01 +0200
Subject: [PATCH 6/6] docs: Add GC soundness proof (ref counting completeness
 for IC/DAG heap)

---
 docs/GC_SOUNDNESS.md | 123 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 docs/GC_SOUNDNESS.md

diff --git a/docs/GC_SOUNDNESS.md b/docs/GC_SOUNDNESS.md
new file mode 100644
index 00000000..245c1118
--- /dev/null
+++ b/docs/GC_SOUNDNESS.md
@@ -0,0 +1,123 @@
+# GC Soundness: Why Ref Counting is Complete for HVM4
+
+This document proves that reference counting alone is sufficient for complete garbage collection in HVM4, without requiring cycle detection or tracing GC.
+
+## Core Claim
+
+**Theorem**: The HVM4 heap is always a DAG (Directed Acyclic Graph). Therefore, reference counting is complete — every unreachable node will eventually have refcount 0.
+
+## Background
+
+Traditional ref counting fails on cycles:
+```
+A → B → A   // Both have refcount=1 forever, leaked
+```
+
+Tracing GC solves this by periodically walking the entire heap to find unreachable cycles. This introduces pause times and complexity.
+
+HVM4 avoids this entirely: **cycles are structurally impossible**.
+
+## Proof
+
+### Lemma 1: Allocation Order
+
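+Every node in the HVM4 heap is allocated at a definite point in the reduction. Number the allocations in the order they happen and call that number the node's *birth time* `t(n)`. Heap addresses are a valid proxy for `t(n)` only under pure bump allocation; once freed blocks are recycled through free lists, a newer node can sit at a lower address, so the argument below uses allocation order, not addresses.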
+
+### Lemma 2: Reference Direction
+
+When node A references node B, we have `t(B) < t(A)`. In other words, **nodes can only reference previously-allocated nodes**.
+
+*Proof*: In IC reduction:
+- `@name` references a statically-defined node (birth time 0)
+- Lambda application `f(x)` creates a new node referencing existing `f` and `x`
+- DUP creates a SUP node referencing the original (older) node
+- No operation creates a reference to a "future" node
+
+### Lemma 3: No Self-Reference
+
+A node cannot reference itself: `t(A) < t(A)` is a contradiction.
+
+### Theorem: DAG Property
+
+**Proof by contradiction**: Assume a cycle exists: `A₁ → A₂ → ... → Aₙ → A₁`
+
+By Lemma 2:
+- `t(A₁) > t(A₂)` (A₁ references A₂)
+- `t(A₂) > t(A₃)`
+- ...
+- `t(Aₙ) > t(A₁)`
+
+Chaining these: `t(A₁) > t(A₂) > ... > t(Aₙ) > t(A₁)`
+
+This implies `t(A₁) > t(A₁)`, a contradiction. ∎
+
+### Corollary: Ref Counting is Complete
+
+In a DAG:
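+1. The unreachable nodes form a finite acyclic subgraph, so at least one of them has no incoming reference from another unreachable node; no reachable node points to it either (otherwise it would be reachable), so its refcount is 0 and it can be freed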
+2. When a node's refcount hits 0, all nodes it references can have their refcounts decremented
+3. This cascades through the DAG until all unreachable nodes are freed
+
+No cycle can "protect" unreachable nodes from collection.
+
+## What About Recursion?
+
+Recursive definitions like the Y combinator don't create heap cycles:
+
+```hvm4
+@Y = λ&f. f(@Y(f))
+```
+
+Each recursive call allocates a *new* thunk:
+```
+t=0: Y defined
+t=1: @Y(f) called → new thunk T₁ referencing f (t < 1)
+t=2: T₁ reduces, calls @Y(f) → new thunk T₂ referencing f
+...
+```
+
+The chain `T₁ → f`, `T₂ → f`, etc. forms a tree (or DAG), not a cycle. The "infinite recursion" is infinite *unfolding*, not circular reference.
+
+## What About DUP/SUP?
+
+Duplication creates explicit sharing via superposition:
+
+```hvm4
+!&x = expensive_computation;
+[x, x]  // x used twice
+```
+
+This creates:
+```
+SUP_node → expensive_computation
+result_list → SUP_node (twice)
+```
+
+The SUP node references the *original* computation (older). When both uses of `x` are consumed, SUP's refcount drops to 0, then the original's refcount decrements.
+
+## Epoch Allocator
+
+The epoch-based allocator leverages this guarantee:
+
+1. **Epoch N**: Allocate nodes freely
+2. **Epoch N+1**: Any node from epoch N with refcount=0 is bulk-freed
+
+No scanning, no marking, no tracing. Just batched refcount checks.
+
+## FFI Considerations
+
+The DAG guarantee holds for pure HVM4 code. External FFI with mutable state requires care:
+- FFI-allocated objects should be wrapped with explicit ref management
+- Or use epoch pinning to prevent premature collection
+
+## Conclusion
+
+HVM4's interaction combinator semantics structurally guarantee a DAG heap. This is not a runtime property to be checked — it's an invariant maintained by the reduction rules themselves.
+
+**Reference counting + epoch batching = complete, pauseless GC.**
+
+## References
+
+- Lamping, J. (1990). An algorithm for optimal lambda calculus reduction
+- Asperti, A., & Guerrini, S. (1998). The optimal implementation of functional programming languages
+- Levy, J. J. (1980). Optimal reductions in the lambda calculus