From 1534e80c3300b41ca15755d8ff74c0ce5f2a64bd Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Tue, 31 Mar 2026 11:39:31 -0700 Subject: [PATCH 01/14] make bytecode generic for 32 and 64 bit word size --- source/pip/qsharp/_adaptive_pass.py | 25 ++- source/pip/qsharp/_simulation.py | 8 +- source/pip/src/qir_simulation.rs | 128 ++++++------- .../pip/src/qir_simulation/gpu_full_state.rs | 7 +- source/pip/tests/test_adaptive_pass.py | 4 +- source/simulators/src/bytecode.rs | 177 ++++++++---------- .../src/gpu_full_state_simulator.rs | 2 +- .../gpu_full_state_simulator/gpu_context.rs | 24 ++- .../gpu_full_state_simulator/shader_types.rs | 57 ++++++ 9 files changed, 236 insertions(+), 196 deletions(-) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 81c7c4b6f6..d7f11fdeaa 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -12,11 +12,18 @@ from __future__ import annotations from dataclasses import dataclass, astuple +from enum import Enum import pyqir import struct from typing import Any, Dict, List, Optional, Tuple, TypeAlias, cast from ._adaptive_bytecode import * + +class Bytecode(Enum): + Bit32 = 1 + Bit64 = 2 + + # --------------------------------------------------------------------------- # Gate name → OpID mapping (must match shader_types.rs OpID enum) # --------------------------------------------------------------------------- @@ -201,8 +208,8 @@ def __post_init__(self): class FloatOperand: - def __init__(self, val: float = 0.0) -> None: - self.val: int = encode_float_as_bits(val) + def __init__(self, val: float, bytecode_kind: Bytecode) -> None: + self.val: int = encode_float_as_bits(val, bytecode_kind) @dataclass @@ -255,14 +262,17 @@ def unwrap_operands( return (dst, src0, src1, aux0, aux1, aux2, aux3) -def encode_float_as_bits(val: float) -> int: - return struct.unpack(" int: + if bytecode_kind == Bytecode.Bit32: + return struct.unpack(" IntOperand | FloatOperand | Re if 
isinstance(value, pyqir.FloatConstant): val = value.value - return FloatOperand(val) + return FloatOperand(val, self._bytecode_kind) # Forward reference (e.g. phi incoming from a later block). # Pre-allocate a register; the defining instruction will reuse it @@ -752,7 +763,7 @@ def _emit_quantum_call(self, call: pyqir.Call) -> None: angle = self._resolve_angle_operand(call.args[0]) else: qubit_arg_offset = 0 - angle = FloatOperand() + angle = FloatOperand(0.0, self._bytecode_kind) qubit_arg_offset = 1 if gate_name in ROTATION_GATES else 0 q1, q2, q3 = self._resolve_qubit_operands(call.args[qubit_arg_offset:]) qop_idx = self._emit_quantum_op(op_id, q1.val, q2.val, q3.val, angle.val) diff --git a/source/pip/qsharp/_simulation.py b/source/pip/qsharp/_simulation.py index 084b7ca625..f237e46278 100644 --- a/source/pip/qsharp/_simulation.py +++ b/source/pip/qsharp/_simulation.py @@ -25,7 +25,7 @@ ) from ._qsharp import QirInputData, Result from typing import TYPE_CHECKING -from ._adaptive_pass import AdaptiveProfilePass, OP_RECORD_OUTPUT +from ._adaptive_pass import AdaptiveProfilePass, Bytecode, OP_RECORD_OUTPUT if TYPE_CHECKING: # This is in the pyi file only from ._native import GpuShotResults @@ -551,7 +551,7 @@ def run_qir_gpu( # Ccx is not support in the GPU simulator, decompose it DecomposeCcxPass().run(mod) if is_adaptive(mod): - program = AdaptiveProfilePass().run(mod, noise) + program = AdaptiveProfilePass(Bytecode.Bit32).run(mod, noise) results = run_adaptive_parallel_shots(program.as_dict(), shots, noise, seed) # Extract recorded output result indices from the bytecode. 
@@ -646,7 +646,9 @@ def set_program(self, input: Union[QirInputData, str, bytes]): noise_intrinsics = None if self.tables is not None: noise_intrinsics = {name: table_id for table_id, name, _ in self.tables} - program = AdaptiveProfilePass().run(mod, noise_intrinsics=noise_intrinsics) + program = AdaptiveProfilePass(Bytecode.Bit32).run( + mod, noise_intrinsics=noise_intrinsics + ) self.gpu_context.set_adaptive_program(program.as_dict()) # Extract recorded output result indices from the bytecode. diff --git a/source/pip/src/qir_simulation.rs b/source/pip/src/qir_simulation.rs index 6e41c2da97..db416cdd41 100644 --- a/source/pip/src/qir_simulation.rs +++ b/source/pip/src/qir_simulation.rs @@ -7,7 +7,7 @@ pub(crate) mod gpu_full_state; use crate::qir_simulation::correlated_noise::parse_noise_table; -use num_traits::Float; +use num_traits::{Float, Unsigned}; use pyo3::{ Bound, FromPyObject, Py, PyRef, PyResult, Python, exceptions::{PyAttributeError, PyKeyError, PyTypeError, PyValueError}, @@ -752,89 +752,69 @@ fn from_intrinsics_table_ref( .collect() } -fn pydict_to_adaptive_program(program: &Bound<'_, PyDict>) -> PyResult { - use bytecode::{AdaptiveProgram, Block, Function, Instruction, PhiNodeEntry, SwitchCase}; +fn extract_key(dict: &Bound<'_, PyDict>, key: &'static str) -> PyResult +where + T: for<'a, 'py> FromPyObject<'a, 'py, Error = pyo3::PyErr>, +{ use pyo3::types::PyDictMethods; + dict.get_item(key)? + .ok_or_else(move || PyKeyError::new_err(key))? 
+ .extract() +} + +fn adaptive_program_from_pydict( + dict: &Bound<'_, PyDict>, +) -> PyResult> +where + Word: Unsigned + Default + for<'a, 'py> FromPyObject<'a, 'py, Error = pyo3::PyErr>, +{ + use bytecode::{AdaptiveProgram, Block, Function, Instruction, Op, PhiNodeEntry, SwitchCase}; + + type BlockTuple = (W, W, W); + type InsTuple = (W, W, W, W, W, W, W, W); + type OpTuple = (W, W, W, W, f64); + type FunTuple = (W, W, W); + type PhiTuple = (W, W); + type SwitchTuple = (W, W); + + let num_qubits: u32 = extract_key(dict, "num_qubits")?; + let num_results: u32 = extract_key(dict, "num_results")?; + let num_registers: u32 = extract_key(dict, "num_registers")?; + let entry_block: Word = extract_key(dict, "entry_block")?; + + let instructions = extract_key::>>(dict, "instructions")? + .into_iter() + .map(Instruction::from_tuple) + .collect(); - // Extract scalar fields - let num_qubits: u32 = program - .get_item("num_qubits")? - .ok_or_else(|| PyKeyError::new_err("num_qubits"))? - .extract()?; - let num_results: u32 = program - .get_item("num_results")? - .ok_or_else(|| PyKeyError::new_err("num_results"))? - .extract()?; - let num_registers: u32 = program - .get_item("num_registers")? - .ok_or_else(|| PyKeyError::new_err("num_registers"))? - .extract()?; - let entry_block: u32 = program - .get_item("entry_block")? - .ok_or_else(|| PyKeyError::new_err("entry_block"))? - .extract()?; - - // Extract array fields - let blocks: Vec<(u32, u32, u32)> = program - .get_item("blocks")? - .ok_or_else(|| PyKeyError::new_err("blocks"))? - .extract()?; - #[allow(clippy::type_complexity)] - let instructions: Vec<(u32, u32, u32, u32, u32, u32, u32, u32)> = program - .get_item("instructions")? - .ok_or_else(|| PyKeyError::new_err("instructions"))? - .extract()?; - let quantum_ops_raw: Vec<(u32, u32, u32, u32, f64)> = program - .get_item("quantum_ops")? - .ok_or_else(|| PyKeyError::new_err("quantum_ops"))? 
- .extract()?; - let functions: Vec<(u32, u32, u32)> = program - .get_item("functions")? - .ok_or_else(|| PyKeyError::new_err("functions"))? - .extract()?; - let phi_entries: Vec<(u32, u32)> = program - .get_item("phi_entries")? - .ok_or_else(|| PyKeyError::new_err("phi_entries"))? - .extract()?; - let switch_cases: Vec<(u32, u32)> = program - .get_item("switch_cases")? - .ok_or_else(|| PyKeyError::new_err("switch_cases"))? - .extract()?; - let mut call_args: Vec = program - .get_item("call_args")? - .ok_or_else(|| PyKeyError::new_err("call_args"))? - .extract()?; - - // Build quantum Op pool using existing gate constructors - let quantum_ops = bytecode::build_op_pool(&quantum_ops_raw); - - // Convert instructions to Instruction structs - let bytecode: Vec = instructions - .iter() - .map(|t| Instruction::from_tuple(*t)) + let quantum_ops = extract_key::>>(dict, "quantum_ops")? + .into_iter() + .map(Op::from_tuple) .collect(); - // Convert block table: strip block_id and pred_count, keep (instr_offset, instr_count) - let mut block_table: Vec = blocks - .iter() - .map(|&(_block_id, instr_offset, instr_count)| (instr_offset, instr_count)) + let mut block_table = extract_key::>>(dict, "blocks")? + .into_iter() + .map(|(_block_id, instr_offset, instr_count)| (instr_offset, instr_count)) .map(Block::from_tuple) .collect(); - // Convert function table - let mut function_table: Vec = - functions.iter().map(|&t| Function::from_tuple(t)).collect(); + let mut function_table = extract_key::>>(dict, "functions")? + .into_iter() + .map(Function::from_tuple) + .collect(); - // Convert phi entries and switch cases - let mut phi_entries: Vec = phi_entries - .iter() - .map(|&t| PhiNodeEntry::from_tuple(t)) + let mut phi_entries = extract_key::>>(dict, "phi_entries")? + .into_iter() + .map(PhiNodeEntry::from_tuple) .collect(); - let mut switch_cases: Vec = switch_cases - .iter() - .map(|&t| SwitchCase::from_tuple(t)) + + let mut switch_cases = extract_key::>>(dict, "switch_cases")? 
+ .into_iter() + .map(SwitchCase::from_tuple) .collect(); + let mut call_args = extract_key::>(dict, "call_args")?; + // WebGPU requires that arrays have at least one element, // so, we push a dummy element on each of these arrays if they are empty. push_default_if_empty(&mut block_table); @@ -844,7 +824,7 @@ fn pydict_to_adaptive_program(program: &Bound<'_, PyDict>) -> PyResult( ) -> PyResult> { let noise = noise_config.map(|noise_config| unbind_noise_config(py, noise_config)); let rng_seed = seed.unwrap_or(0xfeed_face); - let program = pydict_to_adaptive_program(input)?; + let program = adaptive_program_from_pydict(input)?; let result_count: usize = program.num_results as usize; let sim_results = qdk_simulators::run_adaptive_shots_sync(program, &noise, shots, rng_seed, 0) .map_err(PyRuntimeError::new_err)?; diff --git a/source/pip/tests/test_adaptive_pass.py b/source/pip/tests/test_adaptive_pass.py index 8e1d0b89fa..b8df0dae4f 100644 --- a/source/pip/tests/test_adaptive_pass.py +++ b/source/pip/tests/test_adaptive_pass.py @@ -12,7 +12,7 @@ import pyqir import pytest -from qsharp._adaptive_pass import AdaptiveProfilePass, AdaptiveProgram +from qsharp._adaptive_pass import AdaptiveProfilePass, AdaptiveProgram, Bytecode from qsharp._adaptive_bytecode import * @@ -24,7 +24,7 @@ def _run_pass(ir: str, name: str = "test.ll") -> AdaptiveProgram: """Parse an LLVM IR string and run through AdaptiveProfilePass.""" mod = pyqir.Module.from_ir(pyqir.Context(), ir, name) - return AdaptiveProfilePass().run(mod) + return AdaptiveProfilePass(Bytecode.Bit32).run(mod) def _primary(opcode_word: int) -> int: diff --git a/source/simulators/src/bytecode.rs b/source/simulators/src/bytecode.rs index 8c2e4e066a..6fb246e857 100644 --- a/source/simulators/src/bytecode.rs +++ b/source/simulators/src/bytecode.rs @@ -6,12 +6,18 @@ //! Values must stay in sync with the Python `_adaptive_opcodes.py` module. 
use bytemuck::{Pod, Zeroable}; +use num_traits::Unsigned; -use crate::shader_types::{Op, ops}; +// We need these for uploading data to the GPU. +unsafe impl Pod for Instruction {} +unsafe impl Pod for Block {} +unsafe impl Pod for Function {} +unsafe impl Pod for PhiNodeEntry {} +unsafe impl Pod for SwitchCase {} /// Stores a parsed adaptive program. #[derive(Debug)] -pub struct AdaptiveProgram { +pub struct AdaptiveProgram { /// Number of qubits used by the program. pub num_qubits: u32, /// Number of result registers used by the program. @@ -19,21 +25,21 @@ pub struct AdaptiveProgram { /// Number of virtual registers used by the program. pub num_registers: u32, /// Entry block ID for the program. - pub entry_block: u32, + pub entry_block: Word, /// Bytecode instructions. - pub instructions: Vec, + pub instructions: Vec>, /// Block table: indexed by block ID. - pub block_table: Vec, + pub block_table: Vec>, /// Function table. - pub function_table: Vec, + pub function_table: Vec>, /// Phi side table: `[predecessor_block_id, value_register]` entries. - pub phi_entries: Vec, + pub phi_entries: Vec>, /// Switch side table: `[match_value, target_block]` entries. - pub switch_cases: Vec, + pub switch_cases: Vec>, /// Call argument register indices. - pub call_args: Vec, + pub call_args: Vec, /// Quantum op pool (full `Op` structs with expanded unitaries). - pub quantum_ops: Vec, + pub quantum_ops: Vec>, } // --------------------------------------------------------------------------- @@ -48,28 +54,29 @@ pub struct AdaptiveProgram { /// - `src0`, `src1`: source registers or immediates /// - `aux0`–`aux3`: auxiliary fields (gate index, block ids, side-table offsets, etc.) 
#[repr(C)] -#[derive(Copy, Clone, Debug, Default, Pod, Zeroable)] -pub struct Instruction { - pub opcode: u32, - pub dst: u32, - pub src0: u32, - pub src1: u32, - pub aux0: u32, - pub aux1: u32, - pub aux2: u32, - pub aux3: u32, +#[derive(Copy, Clone, Debug, Default, Zeroable)] +pub struct Instruction { + pub opcode: Word, + pub dst: Word, + pub src0: Word, + pub src1: Word, + pub aux0: Word, + pub aux1: Word, + pub aux2: Word, + pub aux3: Word, } -const _: () = assert!(std::mem::size_of::() == 32); +const _: () = assert!(std::mem::size_of::>() == 32); +const _: () = assert!(std::mem::size_of::>() == 64); // --------------------------------------------------------------------------- // Helper functions // --------------------------------------------------------------------------- -impl Instruction { +impl Instruction { /// Create an [`Instruction`] from an 8-tuple (matching Python emission format). #[must_use] - pub const fn from_tuple(t: (u32, u32, u32, u32, u32, u32, u32, u32)) -> Self { + pub fn from_tuple(t: (Word, Word, Word, Word, Word, Word, Word, Word)) -> Self { Self { opcode: t.0, dst: t.1, @@ -81,7 +88,9 @@ impl Instruction { aux3: t.7, } } +} +impl Instruction { /// Extract the primary opcode (bits [7:0]). #[must_use] pub const fn primary_opcode(&self) -> u8 { @@ -102,23 +111,23 @@ impl Instruction { /// Check whether a specific flag bit is set. #[must_use] - pub const fn has_flag(&self, flag: u32) -> bool { + pub const fn has_flag(&self, flag: u64) -> bool { self.opcode & flag != 0 } } /// A basic block descriptor. #[repr(C)] -#[derive(Copy, Clone, Debug, Default, Pod, Zeroable)] -pub struct Block { - pub instr_offset: u32, - pub instr_count: u32, +#[derive(Copy, Clone, Debug, Default, Zeroable)] +pub struct Block { + pub instr_offset: Word, + pub instr_count: Word, } -impl Block { +impl Block { /// Create a [`Block`] from an 2-tuple (matching Python emission format). 
#[must_use] - pub const fn from_tuple(t: (u32, u32)) -> Self { + pub fn from_tuple(t: (Word, Word)) -> Self { Self { instr_offset: t.0, instr_count: t.1, @@ -133,39 +142,39 @@ impl Block { /// The `reserved` field pads the struct to 16 bytes so it matches /// the GPU shader layout (`vec4`). #[repr(C)] -#[derive(Copy, Clone, Debug, Default, Pod, Zeroable)] -pub struct Function { - pub entry_block_id: u32, - pub param_count: u32, - pub param_base_reg: u32, - pub reserved: u32, +#[derive(Copy, Clone, Debug, Default, Zeroable)] +pub struct Function { + pub entry_block_id: Word, + pub param_count: Word, + pub param_base_reg: Word, + pub reserved: Word, } -impl Function { +impl Function { /// Create a [`Function`] from a 3-tuple (matching Python emission format). #[must_use] - pub const fn from_tuple(t: (u32, u32, u32)) -> Self { + pub fn from_tuple(t: (Word, Word, Word)) -> Self { Self { entry_block_id: t.0, param_count: t.1, param_base_reg: t.2, - reserved: 0, + reserved: Word::default(), } } } /// A component of a phi node. #[repr(C)] -#[derive(Copy, Clone, Debug, Default, Pod, Zeroable)] -pub struct PhiNodeEntry { - block_id: u32, - val_reg: u32, +#[derive(Copy, Clone, Debug, Default, Zeroable)] +pub struct PhiNodeEntry { + block_id: Word, + val_reg: Word, } -impl PhiNodeEntry { +impl PhiNodeEntry { /// Create a [`PhiNodeEntry`] from an 2-tuple (matching Python emission format). #[must_use] - pub const fn from_tuple(t: (u32, u32)) -> Self { + pub fn from_tuple(t: (Word, Word)) -> Self { Self { block_id: t.0, val_reg: t.1, @@ -175,16 +184,16 @@ impl PhiNodeEntry { /// A switch case. #[repr(C)] -#[derive(Copy, Clone, Debug, Default, Pod, Zeroable)] -pub struct SwitchCase { - case_val: u32, - target_block: u32, +#[derive(Copy, Clone, Debug, Default, Zeroable)] +pub struct SwitchCase { + case_val: Word, + target_block: Word, } -impl SwitchCase { +impl SwitchCase { /// Create a [`SwitchCase`] from an 2-tuple (matching Python emission format). 
#[must_use] - pub const fn from_tuple(t: (u32, u32)) -> Self { + pub fn from_tuple(t: (Word, Word)) -> Self { Self { case_val: t.0, target_block: t.1, @@ -192,50 +201,24 @@ impl SwitchCase { } } -/// Build a pool of [`Op`] structs from compact `(op_id, q1, q2, q3, angle)` tuples. -/// -/// Maps each `OpID` integer to the corresponding `Op::new_*` constructor, expanding -/// the unitary matrix for use on the GPU. -#[must_use] -pub fn build_op_pool(compact_ops: &[(u32, u32, u32, u32, f64)]) -> Vec { - compact_ops - .iter() - .map(|&(op_id, q1, q2, _q3, angle)| { - #[allow(clippy::cast_possible_truncation)] - let angle_f32 = angle as f32; - match op_id { - ops::ID => Op::new_id_gate(q1), - ops::RESETZ => Op::new_resetz_gate(q1), - ops::X => Op::new_x_gate(q1), - ops::Y => Op::new_y_gate(q1), - ops::Z => Op::new_z_gate(q1), - ops::H => Op::new_h_gate(q1), - ops::S => Op::new_s_gate(q1), - ops::S_ADJ => Op::new_s_adj_gate(q1), - ops::T => Op::new_t_gate(q1), - ops::T_ADJ => Op::new_t_adj_gate(q1), - ops::SX => Op::new_sx_gate(q1), - ops::SX_ADJ => Op::new_sx_adj_gate(q1), - ops::RX => Op::new_rx_gate(angle_f32, q1), - ops::RY => Op::new_ry_gate(angle_f32, q1), - ops::RZ => Op::new_rz_gate(angle_f32, q1), - ops::CX => Op::new_cx_gate(q1, q2), - ops::CY => Op::new_cy_gate(q1, q2), - ops::CZ => Op::new_cz_gate(q1, q2), - ops::RXX => Op::new_rxx_gate(angle_f32, q1, q2), - ops::RYY => Op::new_ryy_gate(angle_f32, q1, q2), - ops::RZZ => Op::new_rzz_gate(angle_f32, q1, q2), - ops::SWAP => Op::new_swap_gate(q1, q2), - ops::MZ => Op::new_mz_gate(q1, q2), - ops::MRESETZ => Op::new_mresetz_gate(q1, q2), - ops::MOVE => Op::new_move_gate(q1), - ops::CORRELATED_NOISE => { - // For adaptive path: q1 = noise_table_idx, q2 = qubit_count. - // Qubit IDs are resolved at runtime from instruction aux fields. 
- Op::new_2q_gate(ops::CORRELATED_NOISE, q1, q2) - } - _ => panic!("Unknown op_id in adaptive quantum op pool: {op_id}"), - } - }) - .collect() +#[derive(Debug)] +pub struct Op { + pub op_id: Word, + pub q1: Word, + pub q2: Word, + pub q3: Word, + pub angle: f64, +} + +impl Op { + #[must_use] + pub fn from_tuple(t: (Word, Word, Word, Word, f64)) -> Self { + Self { + op_id: t.0, + q1: t.1, + q2: t.2, + q3: t.3, + angle: t.4, + } + } } diff --git a/source/simulators/src/gpu_full_state_simulator.rs b/source/simulators/src/gpu_full_state_simulator.rs index 0bebb02d40..503b7cd434 100644 --- a/source/simulators/src/gpu_full_state_simulator.rs +++ b/source/simulators/src/gpu_full_state_simulator.rs @@ -44,7 +44,7 @@ pub fn run_shots_sync( } pub fn run_adaptive_shots_sync( - program: AdaptiveProgram, + program: AdaptiveProgram, noise: &Option>, shot_count: i32, rng_seed: u32, diff --git a/source/simulators/src/gpu_full_state_simulator/gpu_context.rs b/source/simulators/src/gpu_full_state_simulator/gpu_context.rs index c0d8902a9d..df8ae7191e 100644 --- a/source/simulators/src/gpu_full_state_simulator/gpu_context.rs +++ b/source/simulators/src/gpu_full_state_simulator/gpu_context.rs @@ -12,9 +12,10 @@ use crate::gpu_resources::GpuResources; use crate::noise_config::NoiseConfig; use crate::noise_mapping::get_noise_ops; use crate::shader_types::{ - DiagnosticsData, InterpreterState, MAX_BUFFER_SIZE, MAX_QUBIT_COUNT, MAX_QUBITS_PER_WORKGROUP, - MAX_REGISTERS, MAX_SHOT_ENTRIES, MAX_SHOTS_PER_BATCH, MIN_QUBIT_COUNT, MIN_REGISTERS, Op, - SIZEOF_SHOTDATA, THREADS_PER_WORKGROUP, Uniforms, WorkgroupCollationBuffer, ops, + self, DiagnosticsData, InterpreterState, MAX_BUFFER_SIZE, MAX_QUBIT_COUNT, + MAX_QUBITS_PER_WORKGROUP, MAX_REGISTERS, MAX_SHOT_ENTRIES, MAX_SHOTS_PER_BATCH, + MIN_QUBIT_COUNT, MIN_REGISTERS, Op, SIZEOF_SHOTDATA, THREADS_PER_WORKGROUP, Uniforms, + WorkgroupCollationBuffer, ops, }; // On Windows, running larger circuits/shots can hit TDR issues if too many ops are 
dispatched in one go. @@ -33,7 +34,7 @@ pub struct GpuContext { run_params: RunParams, // Adaptive program data (set via set_adaptive_program) - adaptive_program: Option, + adaptive_program: Option>, // Indicates if items impacting the Ops have changed and need to be re-uploaded / recompiled program_is_dirty: bool, @@ -539,8 +540,10 @@ impl GpuContext { } } - pub fn set_adaptive_program(&mut self, program: AdaptiveProgram) -> Result<(), String> { + pub fn set_adaptive_program(&mut self, program: AdaptiveProgram) -> Result<(), String> { self.program.clear(); + self.program + .extend_from_slice(&shader_types::build_op_pool(&program.quantum_ops)); let num_qubits = u32_to_i32(program.num_qubits); // Always allocate a minumum number of qubits to ensure good data alignment, GPU thread usage, etc. @@ -625,8 +628,11 @@ impl GpuContext { .copy_from_slice(&program_bytes); self.resources.upload_batch_data(&batch_data)?; - self.resources - .upload_ops_data(cast_slice(&program.quantum_ops))?; + if let Some(program) = &self.program_with_noise { + self.resources.upload_ops_data(cast_slice(program))?; + } else { + self.resources.upload_ops_data(cast_slice(&self.program))?; + } Ok(()) } @@ -665,7 +671,7 @@ impl GpuContext { .adaptive_program .as_mut() .ok_or("No adaptive program has been set")?; - let (noisy_ops, index_map) = add_noise_to_adaptive_ops(&program.quantum_ops, noise); + let (noisy_ops, index_map) = add_noise_to_adaptive_ops(&self.program, noise); // Patch bytecode instructions that reference quantum op indices. // OP_QUANTUM_GATE (0x10), OP_MEASURE (0x11), OP_RESET (0x12) // all store the op pool index in `aux0`. 
@@ -675,7 +681,7 @@ impl GpuContext { instr.aux0 = index_map[instr.aux0 as usize]; } } - program.quantum_ops = noisy_ops; + self.program_with_noise = Some(noisy_ops); } // Upload the combined batch_data buffer (noise + program) to binding 7 self.upload_batch_data()?; diff --git a/source/simulators/src/gpu_full_state_simulator/shader_types.rs b/source/simulators/src/gpu_full_state_simulator/shader_types.rs index c63d64c1dd..5a45c7c0ed 100644 --- a/source/simulators/src/gpu_full_state_simulator/shader_types.rs +++ b/source/simulators/src/gpu_full_state_simulator/shader_types.rs @@ -3,6 +3,7 @@ use std::f32::consts::FRAC_1_SQRT_2; +use crate::bytecode; use bytemuck::{Pod, Zeroable}; // ********** Constants used by the GPU shader code and structures ********* @@ -1094,3 +1095,59 @@ pub struct InterpreterState { // Total struct size = 64 u32 = 256 bytes (which is aligned to 128 bytes) // safety check to make sure Op is the correct size with padding at compile time const _: () = assert!(std::mem::size_of::() == 256); + +/// Build a pool of [`Op`] structs from compact `(op_id, q1, q2, q3, angle)` tuples. +/// +/// Maps each `OpID` integer to the corresponding `Op::new_*` constructor, expanding +/// the unitary matrix for use on the GPU. 
+#[must_use] +pub fn build_op_pool(compact_ops: &[bytecode::Op]) -> Vec { + compact_ops + .iter() + .map( + |&bytecode::Op { + op_id, + q1, + q2, + q3: _, + angle, + }| { + #[allow(clippy::cast_possible_truncation)] + let angle_f32 = angle as f32; + match op_id { + ops::ID => Op::new_id_gate(q1), + ops::RESETZ => Op::new_resetz_gate(q1), + ops::X => Op::new_x_gate(q1), + ops::Y => Op::new_y_gate(q1), + ops::Z => Op::new_z_gate(q1), + ops::H => Op::new_h_gate(q1), + ops::S => Op::new_s_gate(q1), + ops::S_ADJ => Op::new_s_adj_gate(q1), + ops::T => Op::new_t_gate(q1), + ops::T_ADJ => Op::new_t_adj_gate(q1), + ops::SX => Op::new_sx_gate(q1), + ops::SX_ADJ => Op::new_sx_adj_gate(q1), + ops::RX => Op::new_rx_gate(angle_f32, q1), + ops::RY => Op::new_ry_gate(angle_f32, q1), + ops::RZ => Op::new_rz_gate(angle_f32, q1), + ops::CX => Op::new_cx_gate(q1, q2), + ops::CY => Op::new_cy_gate(q1, q2), + ops::CZ => Op::new_cz_gate(q1, q2), + ops::RXX => Op::new_rxx_gate(angle_f32, q1, q2), + ops::RYY => Op::new_ryy_gate(angle_f32, q1, q2), + ops::RZZ => Op::new_rzz_gate(angle_f32, q1, q2), + ops::SWAP => Op::new_swap_gate(q1, q2), + ops::MZ => Op::new_mz_gate(q1, q2), + ops::MRESETZ => Op::new_mresetz_gate(q1, q2), + ops::MOVE => Op::new_move_gate(q1), + ops::CORRELATED_NOISE => { + // For adaptive path: q1 = noise_table_idx, q2 = qubit_count. + // Qubit IDs are resolved at runtime from instruction aux fields. 
+ Op::new_2q_gate(ops::CORRELATED_NOISE, q1, q2) + } + _ => panic!("Unknown op_id in adaptive quantum op pool: {op_id}"), + } + }, + ) + .collect() +} From 18ed36d75651560f1922fdd45e52946182b65d46 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Wed, 1 Apr 2026 15:21:05 -0700 Subject: [PATCH 02/14] [wip] add cpu bytecode interpreter --- source/pip/qsharp/_adaptive_pass.py | 12 +- source/pip/qsharp/_native.pyi | 36 + source/pip/qsharp/_simulation.py | 125 +- source/pip/src/interpreter.rs | 4 +- source/pip/src/qir_simulation.rs | 2 +- .../pip/src/qir_simulation/cpu_simulators.rs | 160 +- .../pip/tests/test_adaptive_cpu_bytecode.py | 1580 +++++++++++++++++ source/pip/tests/test_adaptive_cpu_noise.py | 410 +++++ .../tests/test_adaptive_cpu_quantum_ops.py | 373 ++++ source/pip/tests/test_adaptive_gpu_noise.py | 2 +- source/pip/tests/test_clifford_simulator.py | 12 +- source/simulators/src/bytecode.rs | 10 +- source/simulators/src/bytecode/runtime.rs | 690 +++++++ 13 files changed, 3345 insertions(+), 71 deletions(-) create mode 100644 source/pip/tests/test_adaptive_cpu_bytecode.py create mode 100644 source/pip/tests/test_adaptive_cpu_noise.py create mode 100644 source/pip/tests/test_adaptive_cpu_quantum_ops.py create mode 100644 source/simulators/src/bytecode/runtime.rs diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index d7f11fdeaa..1aca506831 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -200,11 +200,14 @@ class SwitchCase: @dataclass class IntOperand: val: int = 0 + bits: int = 32 def __post_init__(self): - # Mask to u32 range so negative Python ints become their - # two's-complement u32 representation (e.g. -7 → 0xFFFFFFF9). - self.val = self.val & 0xFFFFFFFF + # Mask to the appropriate word-width so negative Python ints become + # their two's-complement representation + # (e.g. -7 → 0xFFFFFFF9 for 32-bit, 0xFFFFFFFFFFFFFFF9 for 64-bit). 
+ mask = (1 << self.bits) - 1 + self.val = self.val & mask class FloatOperand: @@ -294,6 +297,7 @@ def __init__(self, bytecode_kind: Bytecode): self._func_to_id: Dict[str, int] = {} # function name → function ID self._current_func_is_entry: bool = True self._noise_intrinsics: Optional[Dict[str, int]] = None + self._int_bits = 32 if bytecode_kind == Bytecode.Bit32 else 64 def run( self, @@ -436,7 +440,7 @@ def _resolve_operand(self, value: pyqir.Value) -> IntOperand | FloatOperand | Re if isinstance(value, pyqir.IntConstant): val = value.value - return IntOperand(val) + return IntOperand(val, self._int_bits) if isinstance(value, pyqir.FloatConstant): val = value.value diff --git a/source/pip/qsharp/_native.pyi b/source/pip/qsharp/_native.pyi index 1463ac5e6b..2d059e83ad 100644 --- a/source/pip/qsharp/_native.pyi +++ b/source/pip/qsharp/_native.pyi @@ -1005,6 +1005,42 @@ def run_cpu_full_state( """ ... +def run_cpu_adaptive( + input: dict, + shots: int, + noise: Optional[NoiseConfig] = None, + seed: Optional[int] = None, +) -> List[str]: + """ + Run an adaptive profile QIR program on a CPU full-state simulator. + + The input is an `AdaptiveProgram` converted to a dict using the + .as_dict() method. Uses 64-bit bytecode for full LLVM i64 semantics. + + Returns a list of result strings. Each result string is composed + of '0's, '1's, and 'L's, representing if each measurement result + was a Zero, One, or Loss respectively. + """ + ... + +def run_clifford_adaptive( + input: dict, + shots: int, + noise: Optional[NoiseConfig] = None, + seed: Optional[int] = None, +) -> List[str]: + """ + Run an adaptive profile QIR program on a Clifford stabilizer simulator. + + The input is an `AdaptiveProgram` converted to a dict using the + .as_dict() method. Uses 64-bit bytecode for full LLVM i64 semantics. + + Returns a list of result strings. Each result string is composed + of '0's, '1's, and 'L's, representing if each measurement result + was a Zero, One, or Loss respectively. 
+ """ + ... + def try_create_gpu_adapter() -> str: """ Checks if a compatible GPU adapter is available on the system. diff --git a/source/pip/qsharp/_simulation.py b/source/pip/qsharp/_simulation.py index f237e46278..7a3aae40af 100644 --- a/source/pip/qsharp/_simulation.py +++ b/source/pip/qsharp/_simulation.py @@ -9,8 +9,10 @@ QirInstructionId, QirInstruction, run_clifford, + run_clifford_adaptive, run_parallel_shots, run_adaptive_parallel_shots, + run_cpu_adaptive, run_cpu_full_state, NoiseConfig, GpuContext, @@ -25,7 +27,12 @@ ) from ._qsharp import QirInputData, Result from typing import TYPE_CHECKING -from ._adaptive_pass import AdaptiveProfilePass, Bytecode, OP_RECORD_OUTPUT +from ._adaptive_pass import ( + AdaptiveProfilePass, + AdaptiveProgram, + Bytecode, + OP_RECORD_OUTPUT, +) if TYPE_CHECKING: # This is in the pyi file only from ._native import GpuShotResults @@ -485,6 +492,43 @@ def is_adaptive(mod: pyqir.Module) -> bool: return func_attrs["qir_profiles"].string_value == "adaptive_profile" +def str_to_result(result: str): + match result: + case "0": + return Result.Zero + case "1": + return Result.One + case "L": + return Result.Loss + case _: + raise ValueError(f"Invalid result {result}") + + +def run_adaptive( + rust_run_adaptive_fn: Callable, + program: AdaptiveProgram, + shots: int, + noise: Optional[NoiseConfig], + seed: int, +): + """ + Runs an adaptive program given a rust simulator. Adds output recording logic. + """ + results = rust_run_adaptive_fn(program.as_dict(), shots, noise, seed) + # Extract recorded output result indices from the bytecode. + # OP_RECORD_OUTPUT with aux1=0 is result_record_output where + # src0 is the result index in the results buffer. 
+ recorded_result_indices = [] + for ins in program.instructions: + if (ins.opcode & 0xFF) == OP_RECORD_OUTPUT and ins.aux1 == 0: + recorded_result_indices.append(ins.src0) + # Filter shot_results to only include recorded output indices + filtered = [] + for s in results: + filtered.append([str_to_result(s[i]) for i in recorded_result_indices]) + return filtered + + def run_qir_clifford( input: Union[QirInputData, str, bytes], shots: Optional[int] = 1, @@ -492,19 +536,23 @@ def run_qir_clifford( seed: Optional[int] = None, ) -> List: (mod, shots, noise, seed) = preprocess_simulation_input(input, shots, noise, seed) - if noise is None: - (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) + if is_adaptive(mod): + program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) + return run_adaptive(run_clifford_adaptive, program, shots, noise, seed) else: - (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) - recorder = OutputRecordingPass() - recorder.run(mod) - - return list( - map( - recorder.process_output, - run_clifford(gates, num_qubits, num_results, shots, noise, seed), + if noise is None: + (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) + else: + (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) + recorder = OutputRecordingPass() + recorder.run(mod) + + return list( + map( + recorder.process_output, + run_clifford(gates, num_qubits, num_results, shots, noise, seed), + ) ) - ) def run_qir_cpu( @@ -514,31 +562,23 @@ def run_qir_cpu( seed: Optional[int] = None, ) -> List: (mod, shots, noise, seed) = preprocess_simulation_input(input, shots, noise, seed) - if noise is None: - (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) + if is_adaptive(mod): + program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) + return run_adaptive(run_cpu_adaptive, program, shots, noise, seed) else: - (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) - recorder = 
OutputRecordingPass() - recorder.run(mod) - - return list( - map( - recorder.process_output, - run_cpu_full_state(gates, num_qubits, num_results, shots, noise, seed), - ) - ) - + if noise is None: + (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) + else: + (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) + recorder = OutputRecordingPass() + recorder.run(mod) -def str_to_result(result: str): - match result: - case "0": - return Result.Zero - case "1": - return Result.One - case "L": - return Result.Loss - case _: - raise ValueError(f"Invalid result {result}") + return list( + map( + recorder.process_output, + run_cpu_full_state(gates, num_qubits, num_results, shots, noise, seed), + ) + ) def run_qir_gpu( @@ -552,20 +592,7 @@ def run_qir_gpu( DecomposeCcxPass().run(mod) if is_adaptive(mod): program = AdaptiveProfilePass(Bytecode.Bit32).run(mod, noise) - results = run_adaptive_parallel_shots(program.as_dict(), shots, noise, seed) - - # Extract recorded output result indices from the bytecode. - # OP_RECORD_OUTPUT with aux1=0 is result_record_output where - # src0 is the result index in the results buffer. 
- recorded_result_indices = [] - for ins in program.instructions: - if (ins.opcode & 0xFF) == OP_RECORD_OUTPUT and ins.aux1 == 0: - recorded_result_indices.append(ins.src0) - # Filter shot_results to only include recorded output indices - filtered = [] - for s in results: - filtered.append([str_to_result(s[i]) for i in recorded_result_indices]) - return filtered + return run_adaptive(run_adaptive_parallel_shots, program, shots, noise, seed) else: if noise is None: (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) diff --git a/source/pip/src/interpreter.rs b/source/pip/src/interpreter.rs index ea631ebd23..e488bf3b88 100644 --- a/source/pip/src/interpreter.rs +++ b/source/pip/src/interpreter.rs @@ -24,7 +24,7 @@ use crate::{ noisy_simulator::register_noisy_simulator_submodule, qir_simulation::{ IdleNoiseParams, NoiseConfig, NoiseTable, QirInstruction, QirInstructionId, - cpu_simulators::{run_clifford, run_cpu_full_state}, + cpu_simulators::{run_clifford, run_clifford_adaptive, run_cpu_adaptive, run_cpu_full_state}, gpu_full_state::{ GpuContext, run_adaptive_parallel_shots, run_parallel_shots, try_create_gpu_adapter, }, @@ -134,6 +134,8 @@ fn _native<'a>(py: Python<'a>, m: &Bound<'a, PyModule>) -> PyResult<()> { m.add_function(wrap_pyfunction!(run_clifford, m)?)?; m.add_function(wrap_pyfunction!(try_create_gpu_adapter, m)?)?; m.add_function(wrap_pyfunction!(run_cpu_full_state, m)?)?; + m.add_function(wrap_pyfunction!(run_cpu_adaptive, m)?)?; + m.add_function(wrap_pyfunction!(run_clifford_adaptive, m)?)?; m.add_function(wrap_pyfunction!(run_parallel_shots, m)?)?; m.add_function(wrap_pyfunction!(run_adaptive_parallel_shots, m)?)?; m.add("QSharpError", py.get_type::())?; diff --git a/source/pip/src/qir_simulation.rs b/source/pip/src/qir_simulation.rs index db416cdd41..27f360c92c 100644 --- a/source/pip/src/qir_simulation.rs +++ b/source/pip/src/qir_simulation.rs @@ -762,7 +762,7 @@ where .extract() } -fn adaptive_program_from_pydict( +pub(crate) fn 
adaptive_program_from_pydict( dict: &Bound<'_, PyDict>, ) -> PyResult> where diff --git a/source/pip/src/qir_simulation/cpu_simulators.rs b/source/pip/src/qir_simulation/cpu_simulators.rs index 45d594ae34..dbcc663ebf 100644 --- a/source/pip/src/qir_simulation/cpu_simulators.rs +++ b/source/pip/src/qir_simulation/cpu_simulators.rs @@ -4,11 +4,15 @@ #[cfg(test)] mod tests; -use crate::qir_simulation::{NoiseConfig, QirInstruction, QirInstructionId, unbind_noise_config}; +use crate::qir_simulation::{ + NoiseConfig, QirInstruction, QirInstructionId, adaptive_program_from_pydict, + unbind_noise_config, +}; use pyo3::{IntoPyObjectExt, exceptions::PyValueError, prelude::*, types::PyList}; -use pyo3::{PyResult, pyfunction}; +use pyo3::{PyResult, pyfunction, types::PyDict}; use qdk_simulators::{ MeasurementResult, Simulator, + bytecode::{self, runtime::run_shot as adaptive_run_shot}, cpu_full_state_simulator::{NoiselessSimulator, NoisySimulator}, noise_config::{self, CumulativeNoiseConfig}, stabilizer_simulator::StabilizerSimulator, @@ -267,3 +271,155 @@ fn run_shot(instructions: &[QirInstruction], sim: &mut S) { } } } + +// --------------------------------------------------------------------------- +// Adaptive Profile CPU simulation +// --------------------------------------------------------------------------- + +#[pyfunction] +#[allow(clippy::too_many_arguments)] +pub fn run_cpu_adaptive<'py>( + py: Python<'py>, + input: &Bound<'py, PyDict>, + shots: u32, + noise_config: Option<&Bound<'py, NoiseConfig>>, + seed: Option, +) -> PyResult> { + use qdk_simulators::cpu_full_state_simulator::noise::Fault; + + let program: bytecode::AdaptiveProgram = adaptive_program_from_pydict(input)?; + + let noise: noise_config::NoiseConfig = if let Some(nc) = noise_config { + unbind_noise_config(py, nc) + } else { + noise_config::NoiseConfig::NOISELESS + }; + + let output = if noise_config.is_some() { + let make_simulator = + |num_qubits, num_results, seed, noise: Arc>| { + 
NoisySimulator::new(num_qubits, num_results, seed, noise) + }; + run_adaptive(&program, shots, seed, noise, make_simulator) + } else { + let make_simulator = + |num_qubits, num_results, seed, _noise: Arc>| { + NoiselessSimulator::new(num_qubits, num_results, seed, ()) + }; + run_adaptive(&program, shots, seed, noise, make_simulator) + }; + + let mut array = Vec::with_capacity(shots as usize); + for val in output { + array.push( + val.into_py_any(py).map_err(|e| { + PyValueError::new_err(format!("failed to create Python string: {e}")) + })?, + ); + } + + PyList::new(py, array) + .map_err(|e| PyValueError::new_err(format!("failed to create Python list: {e}")))? + .into_py_any(py) +} + +#[pyfunction] +#[allow(clippy::too_many_arguments)] +pub fn run_clifford_adaptive<'py>( + py: Python<'py>, + input: &Bound<'py, PyDict>, + shots: u32, + noise_config: Option<&Bound<'py, NoiseConfig>>, + seed: Option, +) -> PyResult> { + use qdk_simulators::stabilizer_simulator::noise::Fault; + + let program: bytecode::AdaptiveProgram = adaptive_program_from_pydict(input)?; + + let noise: noise_config::NoiseConfig = if let Some(nc) = noise_config { + unbind_noise_config(py, nc) + } else { + noise_config::NoiseConfig::NOISELESS + }; + + let make_simulator = + |num_qubits, num_results, seed, noise: Arc>| { + StabilizerSimulator::new(num_qubits, num_results, seed, noise) + }; + let output = run_adaptive(&program, shots, seed, noise, make_simulator); + + let mut array = Vec::with_capacity(shots as usize); + for val in output { + array.push( + val.into_py_any(py).map_err(|e| { + PyValueError::new_err(format!("failed to create Python string: {e}")) + })?, + ); + } + + PyList::new(py, array) + .map_err(|e| PyValueError::new_err(format!("failed to create Python list: {e}")))? 
+ .into_py_any(py) +} + +fn run_adaptive( + program: &bytecode::AdaptiveProgram, + shots: u32, + seed: Option, + mut noise: noise_config::NoiseConfig, + make_simulator: SimulatorBuilder, +) -> Vec +where + SimulatorBuilder: Fn(usize, usize, u32, Arc) -> S + Send + Sync, + Noise: From> + Send + Sync, + S: Simulator, +{ + if !noise.rz.is_noiseless() { + if noise.s.is_noiseless() { + noise.s = noise.rz.clone(); + } + if noise.z.is_noiseless() { + noise.z = noise.rz.clone(); + } + if noise.s_adj.is_noiseless() { + noise.s_adj = noise.rz.clone(); + } + } + + let noise: Noise = noise.into(); + let noise = Arc::new(noise); + + let num_qubits = program.num_qubits as usize; + let num_results = program.num_results as usize; + + let mut rng = if let Some(seed) = seed { + StdRng::seed_from_u64(seed.into()) + } else { + StdRng::from_entropy() + }; + + let output = (0..shots) + .map(|_| rng.r#gen()) + .collect::>() + .par_iter() + .map(|shot_seed| { + let mut simulator = make_simulator(num_qubits, num_results, *shot_seed, noise.clone()); + adaptive_run_shot(program, &mut simulator); + simulator.take_measurements() + }) + .collect::>(); + + let mut values = Vec::with_capacity(shots as usize); + for shot_result in output { + let mut buffer = String::with_capacity(shot_result.len()); + for measurement in shot_result { + match measurement { + MeasurementResult::Zero => write!(&mut buffer, "0").expect("write should succeed"), + MeasurementResult::One => write!(&mut buffer, "1").expect("write should succeed"), + MeasurementResult::Loss => write!(&mut buffer, "L").expect("write should succeed"), + } + } + values.push(buffer); + } + values +} diff --git a/source/pip/tests/test_adaptive_cpu_bytecode.py b/source/pip/tests/test_adaptive_cpu_bytecode.py new file mode 100644 index 0000000000..6de5a36da3 --- /dev/null +++ b/source/pip/tests/test_adaptive_cpu_bytecode.py @@ -0,0 +1,1580 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
+ +"""Per-opcode tests for the adaptive CPU bytecode interpreter. + +Each test targets one (or a small family of) bytecode instruction(s) by +supplying hand-written Adaptive Profile QIR that exercises the instruction +and encodes the expected result into a measurement outcome. + +Tests are ordered to match the opcode definitions in ``_adaptive_opcodes.py`` +so that coverage can be verified by reading both files side by side. + +This is a CPU counterpart to ``test_adaptive_gpu_bytecode.py``. +""" + +from collections import Counter +import pytest +import qsharp.openqasm + +from qsharp._simulation import run_qir, Result + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +# Deterministic programs need a single shot but we run multiple shots +# to verify that multiple shots yield the same result. +SHOTS = 100 + +SIM_TYPES = ["cpu", "clifford"] + + +def map_result_list_to_str(results): + results_str = "" + for r in results: + match r: + case Result.Zero: + results_str += "0" + case Result.One: + results_str += "1" + case Result.Loss: + results_str += "L" + return results_str + + +def _run(qir: str, shots: int = SHOTS, seed: int = 42, sim_type: str = "cpu"): + """Run *qir* on the given simulator and return shot results as a list of strings.""" + results = run_qir(qir, shots, seed=seed, type=sim_type) + return [map_result_list_to_str(r) for r in results] + + +def check_result( + qir_fragment: str, + expected: str, + *, + extra_decls: str = "", + num_qubits: int = 1, + num_results: int = 1, + record=None, + sim_type: str = "cpu", +): + """Assert every shot produces *expected*.""" + qir = format_qir( + qir_fragment, + extra_decls=extra_decls, + num_qubits=num_qubits, + num_results=num_results, + record=record, + ) + results = _run(qir, SHOTS, sim_type=sim_type) + counts = Counter(results) + assert counts == { + expected: SHOTS + }, f"Expected all {SHOTS} 
shots to be '{expected}', got {counts}" + + +def check_arith_result(qir_fragment: str, expected: str, sim_type: str = "cpu"): + body = build_arith_body(qir_fragment) + check_result(body, expected, sim_type=sim_type) + + +_DECLS = """\ +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare void @__quantum__qis__mz__body(%Qubit*, %Result*) #1 +declare void @__quantum__qis__reset__body(%Qubit*) +declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__z__body(%Qubit*) +declare void @__quantum__qis__s__body(%Qubit*) +declare void @__quantum__qis__t__body(%Qubit*) +declare void @__quantum__qis__cz__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__rz__body(double, %Qubit*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__initialize(i8*) +""" + + +def format_qir( + body: str, + *, + extra_decls: str = "", + num_qubits: int = 1, + num_results: int = 1, + record=None, +): + if record is None: + record = range(num_results) + output_recording = ( + f" call void @__quantum__rt__tuple_record_output(i64 {len(record)}, i8* null)" + ) + for result_id in record: + output_recording += f"\n call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 {result_id} to %Result*), i8* null)" + + return f"""\ +%Result = type opaque +%Qubit = type opaque + +define i64 @ENTRYPOINT__main() #0 {{ +{body} +{output_recording} + ret i64 0 +}} + +{_DECLS} +{extra_decls} +attributes #0 = {{ "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="{num_qubits}" "required_num_results"="{num_results}" }} +attributes #1 = {{ "irreversible" }} +""" + + +def build_arith_body( + arith_fragment: str, +): + """Builds the body for a QIR module that does classical work and + 
then conditionally applies X to qubit 0 before measuring into result 0. + + *arith_fragment* should produce ``%flag`` (i1) which, when true, causes X. + The measurement of qubit 0 into result 0 is the observable. + """ + return f"""\ +entry: +{arith_fragment} + br i1 %flag, label %then, label %end +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %end +end: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +# ######################################################################### +# Control Flow +# ######################################################################### + + +# ========================================================================= +# OP_NOP — no-op +# ========================================================================= + +NOP_SMOKE_QIR = """ +entry: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_nop_smoke(sim_type): + """Minimal program: just measure |0⟩ → always 0.""" + check_result(NOP_SMOKE_QIR, "0", sim_type=sim_type) + + +# ========================================================================= +# OP_RET — return / program termination +# ========================================================================= + +RET_QIR = """ +entry: + ret i64 0 + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_ret(sim_type): + check_result(RET_QIR, "0", sim_type=sim_type) + + +# ========================================================================= +# OP_JUMP — unconditional jump +# ========================================================================= + +JUMP_QIR = """ +entry: 
+ br label %target + ret i64 0 ; early return - unreachable +target: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_jump(sim_type): + """Unconditional jump lands at target block, X applied → measure 1.""" + check_result(JUMP_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_BRANCH — conditional branch +# ========================================================================= + +BRANCH_TRUE_QIR = """ +entry: + %c = icmp eq i64 1, 1 + br i1 %c, label %yes, label %no + ret i64 0 ; early return - unreachable +yes: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +no: + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +BRANCH_FALSE_QIR = """ +entry: + %c = icmp eq i64 1, 2 + br i1 %c, label %yes, label %no + ret i64 0 ; early return - unreachable +yes: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +no: + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_branch_true(sim_type): + check_result(BRANCH_TRUE_QIR, "1", sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_branch_false(sim_type): + check_result(BRANCH_FALSE_QIR, "0", sim_type=sim_type) + + +# ========================================================================= +# OP_SWITCH — switch dispatch +# ========================================================================= + +SWITCH_CASE1_QIR = """ +entry: + %val = add i64 0, 1 + switch i64 %val, label 
%default [ + i64 0, label %case0 + i64 1, label %case1 + i64 2, label %case2 + ] +case0: + br label %measure +case1: + ; This is the expected path for val==1 + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +case2: + br label %measure +default: + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +SWITCH_DEFAULT_QIR = """ +entry: + %val = add i64 0, 99 + switch i64 %val, label %default [ + i64 0, label %case0 + i64 1, label %case1 + ] +case0: + br label %measure +case1: + br label %measure +default: + ; val=99 takes default path → X applied + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_switch_case(sim_type): + check_result(SWITCH_CASE1_QIR, "1", sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_switch_default(sim_type): + check_result(SWITCH_DEFAULT_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_CALL / OP_CALL_RETURN — function calls +# ========================================================================= + +CALL_QIR = """ +entry: + call void @apply_x(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +CALL_QIR_FN = """ +define void @apply_x(%Qubit* %q) { +entry: + call void @__quantum__qis__x__body(%Qubit* %q) + ret void +} +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_call_and_return(sim_type): + """Call a helper function that applies X, then measure.""" + check_result(CALL_QIR, "1", extra_decls=CALL_QIR_FN, sim_type=sim_type) + + +# 
######################################################################### +# Quantum +# ######################################################################### + + +# ========================================================================= +# OP_QUANTUM_GATE — single and two-qubit gates +# ========================================================================= + +GATE_X_QIR = """ +entry: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +GATE_CNOT_QIR = """ +entry: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_gate_x(sim_type): + check_result(GATE_X_QIR, "1", sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_gate_cnot(sim_type): + check_result(GATE_CNOT_QIR, "1", num_qubits=2, sim_type=sim_type) + + +# ========================================================================= +# OP_MEASURE — measurement (also see OP_READ_RESULT below) +# ========================================================================= + +MZ_THEN_RESET_QIR = """ +entry: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + ; After mz, qubit should still be |1⟩ + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + ; After reset, qubit should be |0⟩ + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to 
%Qubit*), %Result* inttoptr (i64 2 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_mz_then_reset(sim_type): + "X → MZ → MZ → reset should give 110." + check_result(MZ_THEN_RESET_QIR, "110", num_results=3, sim_type=sim_type) + + +# ========================================================================= +# OP_RESET — qubit reset +# ========================================================================= + +RESET_QIR = """ +entry: + ; Put qubit 0 in |1⟩ + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + ; Reset it back to |0⟩ + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + ; Measure — should be 0 + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_reset(sim_type): + """X → reset → measure should give 0.""" + check_result(RESET_QIR, "0", sim_type=sim_type) + + +# ========================================================================= +# OP_READ_RESULT + OP_MEASURE — read measurement results +# ========================================================================= + +READ_RESULT_QIR = """ +entry: + ; Prepare |1⟩ on qubit 0 via X + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + ; Measure qubit 0 → should always be 1 + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + ; Read back the result + %r = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + ; If result was 1, apply X again so qubit is back in |1⟩ for second measurement + br i1 %r, label %then, label %end + +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %end + +end: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + + 
+@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_read_result(sim_type): + """X → MResetZ → read_result → if 1: X again → MResetZ. + First result is always 1, read_result sees it, applies X, second result is also 1. + """ + check_result(READ_RESULT_QIR, "11", num_results=2, sim_type=sim_type) + + +# ========================================================================= +# OP_RECORD_OUTPUT — output recording +# ========================================================================= + +RECORD_OUTPUT_QIR = """ +entry: + ; q0 = |1⟩, q1 = |0⟩ + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_record_output_ordering(sim_type): + """Two results recorded: result0=1, result1=0 → '10'.""" + check_result(RECORD_OUTPUT_QIR, "10", num_qubits=2, num_results=2, sim_type=sim_type) + + +# ######################################################################### +# Integer Arithmetic +# ######################################################################### + +INT_ARITH_PARAMS = [ + # Int + ("add", 3, 4, 7), + ("sub", 10, 3, 7), + ("sub", 3, 10, -7), + ("mul", 6, 7, 42), + ("udiv", 42, 7, 6), + ("sdiv", -42, 7, -6), + ("urem", 10, 3, 1), + ("srem", -10, 3, -1), + # Bitwise + ("and", 255, 15, 15), + ("or", 240, 15, 255), + ("xor", 255, 15, 240), + ("shl", 1, 3, 8), + ("lshr", 32, 2, 8), + ("ashr", -16, 2, -4), +] + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + INT_ARITH_PARAMS, +) +def test_int_arith_imm_imm(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %a = {bin_op} i64 {lhs}, {rhs} + %flag = icmp eq i64 %a, {expected}""", + "1", + sim_type=sim_type, + ) + + 
+@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + INT_ARITH_PARAMS, +) +def test_int_arith_imm_reg(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %rhs = add i64 {rhs}, 0 + %a = {bin_op} i64 {lhs}, %rhs + %flag = icmp eq i64 %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + INT_ARITH_PARAMS, +) +def test_int_arith_reg_imm(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %lhs = add i64 {lhs}, 0 + %a = {bin_op} i64 %lhs, {rhs} + %flag = icmp eq i64 %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + INT_ARITH_PARAMS, +) +def test_int_arith_reg_reg(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %lhs = add i64 {lhs}, 0 + %rhs = add i64 {rhs}, 0 + %a = {bin_op} i64 %lhs, %rhs + %flag = icmp eq i64 %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + INT_ARITH_PARAMS, +) +def test_int_arith_negative_test(sim_type, bin_op, lhs, rhs, expected): + """Checks that the tests fail if the result is different from the expected value.""" + expected = 12345 + check_arith_result( + f""" + %a = {bin_op} i64 {lhs}, {rhs} + %flag = icmp eq i64 %a, {expected}""", + "0", + sim_type=sim_type, + ) + + +# ######################################################################### +# Comparison (OP_ICMP, OP_FCMP) +# ######################################################################### + + +# ========================================================================= +# OP_ICMP — integer comparison (all condition codes) +# ========================================================================= + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) 
+@pytest.mark.parametrize( + "pred,lhs,rhs,expected", + [ + ("eq", 2, 2, "1"), + ("eq", 2, 3, "0"), + ("ne", 2, 3, "1"), + ("ne", 2, 2, "0"), + ("slt", 2, 3, "1"), + ("slt", 2, 2, "0"), + ("sle", 2, 2, "1"), + ("sle", 3, 2, "0"), + ("sgt", 3, 2, "1"), + ("sgt", 2, 3, "0"), + ("sge", 3, 3, "1"), + ("sge", 2, 3, "0"), + ("ult", 2, 3, "1"), + ("ult", 3, 2, "0"), + ("ule", 3, 3, "1"), + ("ule", 3, 2, "0"), + ("ugt", 3, 2, "1"), + ("ugt", 2, 3, "0"), + ("uge", 3, 3, "1"), + ("uge", 2, 3, "0"), + ], +) +def test_icmp(sim_type, pred, lhs, rhs, expected): + check_arith_result( + f"%flag = icmp {pred} i64 {lhs}, {rhs}", + expected, + sim_type=sim_type, + ) + + +# ========================================================================= +# OP_ICMP — signed vs unsigned edge case (negative as unsigned) +# ========================================================================= + +ICMP_SIGNED_VS_UNSIGNED_QIR = """ + ; -1 in two's complement is 0xFFFFFFFFFFFFFFFF, which is the max u64 + ; signed: -1 < 0 → true + %neg1 = sub i64 0, 1 + %flag = icmp slt i64 %neg1, 0 +""" + +ICMP_UNSIGNED_WRAP_QIR = """ + ; unsigned: -1 wraps to max u64, so -1 > 0 → true (unsigned) + %neg1 = sub i64 0, 1 + %flag = icmp ugt i64 %neg1, 0 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_icmp_signed_negative(sim_type): + check_arith_result(ICMP_SIGNED_VS_UNSIGNED_QIR, "1", sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_icmp_unsigned_wrap(sim_type): + check_arith_result(ICMP_UNSIGNED_WRAP_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_FCMP — float comparison +# ========================================================================= + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "pred,lhs,rhs,expected", + [ + ("oeq", "3.0", "3.0", "1"), + ("oeq", "3.0", "4.0", "0"), + ("one", "3.0", "4.0", "1"), + ("one", "3.0", "3.0", "0"), + ("olt", "2.0", 
"3.0", "1"), + ("olt", "3.0", "2.0", "0"), + ("ole", "3.0", "3.0", "1"), + ("ole", "4.0", "3.0", "0"), + ("ogt", "4.0", "3.0", "1"), + ("ogt", "3.0", "4.0", "0"), + ("oge", "3.0", "3.0", "1"), + ("oge", "2.0", "3.0", "0"), + ], +) +def test_fcmp(sim_type, pred, lhs, rhs, expected): + check_arith_result( + f"%flag = fcmp {pred} double {lhs}, {rhs}", + expected, + sim_type=sim_type, + ) + + +# ######################################################################### +# Float Arithmetic (OP_FADD → OP_FDIV) +# ######################################################################### + +FLOAT_ARITH_PARAMS = [ + ("fadd", 1.5, 2.5, 4.0), + ("fsub", 10.0, 3.0, 7.0), + ("fsub", 3.0, 10.0, -7.0), + ("fmul", 6.0, 7.0, 42.0), + ("fdiv", 8.0, 2.0, 4.0), +] + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + FLOAT_ARITH_PARAMS, +) +def test_float_arith_imm_imm(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %a = {bin_op} double {lhs}, {rhs} + %flag = fcmp oeq double %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + FLOAT_ARITH_PARAMS, +) +def test_float_arith_imm_reg(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %rhs = fadd double {rhs}, 0.0 + %a = {bin_op} double {lhs}, %rhs + %flag = fcmp oeq double %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + FLOAT_ARITH_PARAMS, +) +def test_float_arith_reg_imm(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %lhs = fadd double {lhs}, 0.0 + %a = {bin_op} double %lhs, {rhs} + %flag = fcmp oeq double %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + FLOAT_ARITH_PARAMS, +) +def 
test_float_arith_reg_reg(sim_type, bin_op, lhs, rhs, expected): + check_arith_result( + f""" + %lhs = fadd double {lhs}, 0.0 + %rhs = fadd double {rhs}, 0.0 + %a = {bin_op} double %lhs, %rhs + %flag = fcmp oeq double %a, {expected}""", + "1", + sim_type=sim_type, + ) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +@pytest.mark.parametrize( + "bin_op,lhs,rhs,expected", + FLOAT_ARITH_PARAMS, +) +def test_float_arith_negative_test(sim_type, bin_op, lhs, rhs, expected): + """Checks that the tests fail if the result is different from the expected value.""" + expected = 12345.0 + check_arith_result( + f""" + %a = {bin_op} double {lhs}, {rhs} + %flag = fcmp oeq double %a, {expected}""", + "0", + sim_type=sim_type, + ) + + +# ######################################################################### +# Type Conversion (OP_ZEXT → OP_SITOFP) +# ######################################################################### + + +# ========================================================================= +# OP_ZEXT — zero extension +# ========================================================================= + +ZEXT_QIR = """ + ; zext i1 true to i64 → 1, check 1 == 1 → true + %z = zext i1 true to i64 + %flag = icmp eq i64 %z, 1 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_zext(sim_type): + check_arith_result(ZEXT_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_SEXT — sign extension +# ========================================================================= + +SEXT_QIR = """ + ; sext i1 true to i64 → -1 (all ones), check -1 < 0 → true + %s = sext i1 true to i64 + %flag = icmp slt i64 %s, 0 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_sext(sim_type): + check_arith_result(SEXT_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_TRUNC — truncation +# 
========================================================================= + +TRUNC_QIR = """ + ; trunc i64 257 to i32 → 257 (fits), check 257 == 257 → true + %t = trunc i64 257 to i32 + %z = zext i32 %t to i64 + %flag = icmp eq i64 %z, 257 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_trunc(sim_type): + check_arith_result(TRUNC_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_FPEXT / OP_FPTRUNC — float extension/truncation +# ========================================================================= + +FPEXT_QIR = """ + ; fpext float 3.0 to double, then check == 3 + %f32 = fadd float 1.0, 2.0 + %f64 = fpext float %f32 to double + %i = fptosi double %f64 to i64 + %flag = icmp eq i64 %i, 3 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_fpext(sim_type): + check_arith_result(FPEXT_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_INTTOPTR / OP_MOV — dynamic qubit addressing +# ========================================================================= + +INTTOPTR_QIR = """ +entry: + ; Compute qubit ID 0 from arithmetic + %q_id = sub i64 1, 1 + %q = inttoptr i64 %q_id to %Qubit* + call void @__quantum__qis__x__body(%Qubit* %q) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_inttoptr_dynamic_qubit(sim_type): + check_result(INTTOPTR_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_FPTOSI — float to signed int +# ========================================================================= + +FPTOSI_QIR = """ + ; fptosi -3.7 → -3 (truncation toward zero), check -3 < 0 → true + %neg = fsub double 0.0, 3.7 + %i = fptosi double %neg to i64 + %flag = icmp slt i64 %i, 0 +""" + + 
+@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_fptosi(sim_type): + check_arith_result(FPTOSI_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_SITOFP — signed int to float +# ========================================================================= + +SITOFP_QIR = """ + ; sitofp -5 → -5.0, then -5.0 < 0.0 → true + %neg5 = sub i64 0, 5 + %f = sitofp i64 %neg5 to double + %zero = sitofp i64 0 to double + %flag = fcmp olt double %f, %zero +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_sitofp(sim_type): + check_arith_result(SITOFP_QIR, "1", sim_type=sim_type) + + +# ######################################################################### +# SSA / Data Movement (OP_PHI → OP_CONST) +# ######################################################################### + + +# ========================================================================= +# OP_PHI — phi node +# ========================================================================= + +PHI_LOOP_QIR = """ +entry: + br label %loop + +loop: + %i = phi i64 [ 0, %entry ], [ %next, %loop ] + %next = add i64 %i, 1 + %cond = icmp slt i64 %next, 5 + br i1 %cond, label %loop, label %done + +done: + ; %next should be 5 here + %flag = icmp eq i64 %next, 5 + br i1 %flag, label %apply_x, label %measure + +apply_x: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure + +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_phi_loop_counter(sim_type): + check_result(PHI_LOOP_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_SELECT +# ========================================================================= + +SELECT_TRUE_QIR = """ + ; select i1 true, i64 1, i64 0 → 1, then icmp eq 1, 1 → true + 
%s = select i1 true, i64 1, i64 0 + %flag = icmp eq i64 %s, 1 +""" + +SELECT_FALSE_QIR = """ + ; select i1 false, i64 1, i64 0 → 0, then icmp eq 0, 0 → true + %s = select i1 false, i64 1, i64 0 + %flag = icmp eq i64 %s, 0 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_select_true(sim_type): + check_arith_result(SELECT_TRUE_QIR, "1", sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_select_false(sim_type): + check_arith_result(SELECT_FALSE_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_CONST — constant materialization +# ========================================================================= + +CONST_QIR = """ + ; Use a specific constant 12345, check add identity + %a = add i64 12345, 0 + %flag = icmp eq i64 %a, 12345 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_const(sim_type): + check_arith_result(CONST_QIR, "1", sim_type=sim_type) + + +# ######################################################################### +# Boolean (i1) variants of bitwise ops +# ######################################################################### + + +# ========================================================================= +# OP_AND with i1 (boolean AND) — used in classical boolean logic +# ========================================================================= + +AND_I1_QIR = """ +entry: + ; Prepare both qubits in |1⟩ deterministically + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + %r0 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + %r1 = call i1 
@__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + %both = and i1 %r0, %r1 + ; both should be true (1 AND 1 = 1), apply X → measure 1 + br i1 %both, label %then, label %measure + +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure + +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_and_i1_boolean(sim_type): + """Deterministic boolean AND: both qubits |1⟩ → and i1 true, true → X → 1.""" + check_result(AND_I1_QIR, "1", num_qubits=2, num_results=3, record=[2], sim_type=sim_type) + + +# ========================================================================= +# OP_OR with i1 (boolean OR) +# ========================================================================= + +OR_I1_QIR = """ +entry: + ; q0 = |1⟩, q1 = |0⟩ + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + %r0 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + %r1 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + %either = or i1 %r0, %r1 + ; true OR false = true → X → measure 1 + br i1 %either, label %then, label %measure +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_or_i1_boolean(sim_type): + """Deterministic boolean OR: q0=1, q1=0 → or i1 true, false → true → X → 1.""" + check_result(OR_I1_QIR, "1", num_qubits=2, 
num_results=3, record=[2], sim_type=sim_type) + + +# ========================================================================= +# OP_XOR with i1 (boolean XOR / NOT) +# ========================================================================= + +XOR_NOT_QIR = """ +entry: + ; q0 = |0⟩ → measure 0 + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %r0 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + ; XOR with true is NOT: false XOR true = true + %not_r0 = xor i1 %r0, true + br i1 %not_r0, label %then, label %measure + +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure + +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_xor_i1_not(sim_type): + """XOR i1 used as NOT: measure 0 → XOR true → true → X → 1.""" + check_result(XOR_NOT_QIR, "1", num_qubits=1, num_results=2, record=[1], sim_type=sim_type) + + +# ######################################################################### +# Compound / Integration Tests +# ######################################################################### + + +# ========================================================================= +# Chained arithmetic — complex expression +# ========================================================================= + +CHAINED_ARITH_QIR = """ + ; (3 + 4) * 2 - 1 = 13, check 13 == 13 → true + %a = add i64 3, 4 + %b = mul i64 %a, 2 + %c = sub i64 %b, 1 + %flag = icmp eq i64 %c, 13 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_chained_arithmetic(sim_type): + check_arith_result(CHAINED_ARITH_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_PHI with multiple predecessors (diamond CFG) +# 
========================================================================= + +PHI_DIAMOND_QIR = """ +entry: + %c = icmp eq i64 1, 1 + br i1 %c, label %left, label %right +left: + br label %merge +right: + br label %merge +merge: + ; From left: 42, from right: 0. Since condition is true, we go left → 42. + %v = phi i64 [ 42, %left ], [ 0, %right ] + %flag = icmp eq i64 %v, 42 + br i1 %flag, label %apply_x, label %measure +apply_x: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_phi_diamond(sim_type): + """Diamond CFG with phi: true branch → phi resolves to 42 → X → 1.""" + check_result(PHI_DIAMOND_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_SELECT with computed condition +# ========================================================================= + +SELECT_COMPUTED_QIR = """ + ; 5 > 3 is true → select returns 10, check 10 == 10 → true + %cmp = icmp sgt i64 5, 3 + %s = select i1 %cmp, i64 10, i64 20 + %flag = icmp eq i64 %s, 10 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_select_computed(sim_type): + check_arith_result(SELECT_COMPUTED_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# Nested loop — OP_PHI + OP_BRANCH + OP_ADD + OP_ICMP combined +# ========================================================================= + +NESTED_LOOP_SUM_QIR = """ +entry: + br label %loop +loop: + %i = phi i64 [ 1, %entry ], [ %next_i, %loop ] + %sum = phi i64 [ 0, %entry ], [ %next_sum, %loop ] + %next_sum = add i64 %sum, %i + %next_i = add i64 %i, 1 + %cond = icmp sle i64 %next_i, 5 + br i1 %cond, label %loop, label %done +done: + ; %next_sum should be 15 + %flag = icmp eq i64 %next_sum, 
15 + br i1 %flag, label %apply_x, label %measure +apply_x: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_nested_loop_sum(sim_type): + """Sum 1..5 using phi loop, check total == 15.""" + check_result(NESTED_LOOP_SUM_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_QUANTUM_GATE — dynamic qubit addressing in a loop (GHZ-like) +# ========================================================================= + +DYNAMIC_QUBIT_LOOP_QIR = """ +entry: + ; Create |+⟩ on q0 + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %loop +loop: + %i = phi i64 [ 1, %entry ], [ %next_i, %loop ] + %qi = inttoptr i64 %i to %Qubit* + call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Qubit* %qi) + %next_i = add i64 %i, 1 + %cond = icmp sle i64 %next_i, 2 + br i1 %cond, label %loop, label %measure +measure: + ; Measure all 3 qubits — GHZ state means all agree + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_dynamic_qubit_loop(sim_type): + """3-qubit GHZ via dynamic qubit loop — only '000' and '111' should appear.""" + qir = format_qir(DYNAMIC_QUBIT_LOOP_QIR, num_qubits=3, num_results=3) + results = _run(qir, shots=5000, seed=42, sim_type=sim_type) + counts = Counter(results) + assert set(counts.keys()) <= {"000", "111"}, f"Unexpected GHZ outcomes: {counts}" + 
assert counts.get("000", 0) > 1500 + assert counts.get("111", 0) > 1500 + + +# ========================================================================= +# OP_SHL + OP_OR combined — bit packing +# ========================================================================= + +BIT_PACK_QIR = """ + ; Pack bits: (1 << 2) | 1 = 5, check 5 == 5 → true + %shifted = shl i64 1, 2 + %packed = or i64 %shifted, 1 + %flag = icmp eq i64 %packed, 5 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_bit_packing(sim_type): + check_arith_result(BIT_PACK_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# Combined test: all shift and bitwise ops in sequence +# ========================================================================= + +SHIFT_BITWISE_CHAIN_QIR = """ + ; Start with 0b1010 = 10 + ; SHL by 1 → 0b10100 = 20 + ; OR with 0b00011 = 3 → 0b10111 = 23 + ; AND with 0b11110 = 30 → 0b10110 = 22 + ; XOR with 0b00010 = 2 → 0b10100 = 20 + ; LSHR by 2 → 0b00101 = 5 + %step1 = shl i64 10, 1 + %step2 = or i64 %step1, 3 + %step3 = and i64 %step2, 30 + %step4 = xor i64 %step3, 2 + %step5 = lshr i64 %step4, 2 + %flag = icmp eq i64 %step5, 5 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_shift_bitwise_chain(sim_type): + check_arith_result(SHIFT_BITWISE_CHAIN_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_SWITCH with computed value from arithmetic +# ========================================================================= + +SWITCH_ARITH_QIR = """ +entry: + ; Compute 2 * 3 - 4 = 2 + %a = mul i64 2, 3 + %val = sub i64 %a, 4 + switch i64 %val, label %default [ + i64 0, label %case0 + i64 1, label %case1 + i64 2, label %case2 + i64 3, label %case3 + ] +case0: + br label %measure +case1: + br label %measure +case2: + ; Expected path + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure 
+case3: + br label %measure +default: + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_switch_from_arithmetic(sim_type): + """Switch on computed value 2*3-4=2 → case2 → X → 1.""" + check_result(SWITCH_ARITH_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# Float: sitofp → fadd → fptosi round-trip +# ========================================================================= + +FLOAT_ROUNDTRIP_QIR = """ + ; sitofp 7 → 7.0, fadd 7.0 + 3.0 → 10.0, fptosi → 10, check == 10 + %f = sitofp i64 7 to double + %three = fadd double 0.0, 3.0 + %sum = fadd double %f, %three + %i = fptosi double %sum to i64 + %flag = icmp eq i64 %i, 10 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_float_roundtrip(sim_type): + check_arith_result(FLOAT_ROUNDTRIP_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_CALL with return value +# ========================================================================= + +CALL_WITH_RETVAL_QIR = """ +entry: + %result = call i64 @add_numbers(i64 3, i64 4) + %flag = icmp eq i64 %result, 7 + br i1 %flag, label %then, label %measure +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +CALL_WITH_RETVAL_QIR_FN = """ +define i64 @add_numbers(i64 %a, i64 %b) { +entry: + %sum = add i64 %a, %b + ret i64 %sum +} +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_call_with_return_value(sim_type): + """Call a function returning i64, use result in comparison.""" + check_result(CALL_WITH_RETVAL_QIR, "1", extra_decls=CALL_WITH_RETVAL_QIR_FN, sim_type=sim_type) + + 
+# ========================================================================= +# OP_MUL + OP_UDIV + OP_UREM combined +# ========================================================================= + +MUL_DIV_REM_QIR = """ + ; 17 / 5 = 3 (udiv), 17 % 5 = 2 (urem), 3 * 5 + 2 = 17 + %q = udiv i64 17, 5 + %r = urem i64 17, 5 + %product = mul i64 %q, 5 + %reconstructed = add i64 %product, %r + %flag = icmp eq i64 %reconstructed, 17 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_mul_div_rem_identity(sim_type): + """Division identity: (a/b)*b + (a%b) == a.""" + check_arith_result(MUL_DIV_REM_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# OP_MEASURE with mid-circuit branch (measure-and-correct pattern) +# ========================================================================= + +MEASURE_BRANCH_QIR = """ +entry: + ; Deterministically put qubit in |1⟩ + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + ; Measure (should be 1) and reset to |0⟩ + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %r = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + ; Since r=1, branch to 'correct' which applies X to restore |1⟩ + br i1 %r, label %correct, label %measure + +correct: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %measure + +measure: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_measure_and_branch(sim_type): + """Deterministic measure-and-correct: X→MResetZ→read_result→X→MResetZ → always 1.""" + check_result(MEASURE_BRANCH_QIR, "1", num_results=2, record=[1], sim_type=sim_type) + + +# ========================================================================= +# OP_ADD with register-register (no 
immediates) +# ========================================================================= + +ADD_REG_REG_QIR = """ + ; Use computed values in registers, not just immediates + %a = add i64 2, 1 + %b = add i64 3, 1 + %c = add i64 %a, %b + ; 3 + 4 = 7 + %flag = icmp eq i64 %c, 7 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_add_register_register(sim_type): + check_arith_result(ADD_REG_REG_QIR, "1", sim_type=sim_type) + + +# ######################################################################### +# Regression tests — exercising specific edge-cases that previously failed +# ######################################################################### + + +# ========================================================================= +# SREM with negative dividend +# ========================================================================= + +SREM_NEG_DIVIDEND_QIR = """ + ; -7 % 2 = -1, verify result < 0 + %neg7 = sub i64 0, 7 + %a = srem i64 %neg7, 2 + %flag = icmp slt i64 %a, 0 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_srem_negative_dividend(sim_type): + """srem must preserve the sign of the dividend.""" + check_arith_result(SREM_NEG_DIVIDEND_QIR, "1", sim_type=sim_type) + + +SREM_NEG_BOTH_QIR = """ + ; -10 % -3 = -1 (sign follows dividend) + %neg10 = sub i64 0, 10 + %neg3 = sub i64 0, 3 + %a = srem i64 %neg10, %neg3 + %neg1 = sub i64 0, 1 + %flag = icmp eq i64 %a, %neg1 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_srem_negative_both(sim_type): + """srem with both operands negative.""" + check_arith_result(SREM_NEG_BOTH_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# SEXT from i1 (sign-extension must convert 1 → -1) +# ========================================================================= + +SEXT_I1_FALSE_QIR = """ + ; sext i1 false to i64 → 0, check 0 == 0 → true + %s = sext i1 false to i64 + %flag = icmp eq i64 %s, 0 +""" + + 
+@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_sext_i1_false(sim_type): + """sext of false (i1 0) must be 0.""" + check_arith_result(SEXT_I1_FALSE_QIR, "1", sim_type=sim_type) + + +SEXT_I1_RUNTIME_QIR = """ + ; compute i1 true at runtime, sext → -1, check < 0 + %one = add i64 1, 0 + %b = icmp eq i64 %one, 1 + %s = sext i1 %b to i64 + %flag = icmp slt i64 %s, 0 +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_sext_i1_runtime(sim_type): + """sext of a runtime i1 true value must also sign-extend to -1.""" + check_arith_result(SEXT_I1_RUNTIME_QIR, "1", sim_type=sim_type) + + +# ========================================================================= +# Call to IR-defined function with inttoptr constant argument +# ========================================================================= + +CALL_INTTOPTR_ARG_QIR = """ +entry: + call void @apply_h_then_z_then_h(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +CALL_INTTOPTR_ARG_QIR_FN = """ +define void @apply_h_then_z_then_h(%Qubit* %q) { +entry: + call void @__quantum__qis__h__body(%Qubit* %q) + call void @__quantum__qis__z__body(%Qubit* %q) + call void @__quantum__qis__h__body(%Qubit* %q) + ret void +} +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_call_inttoptr_arg(sim_type): + """Call a helper with an inttoptr constant expression argument.""" + check_result(CALL_INTTOPTR_ARG_QIR, "1", extra_decls=CALL_INTTOPTR_ARG_QIR_FN, sim_type=sim_type) + + +# ========================================================================= +# SITOFP with negative value (signed int → float) +# ========================================================================= + +SITOFP_NEG_QIR = """ + ; sitofp -3 → -3.0, then -3.0 < 0.0 → true + %neg3 = sub i64 0, 3 + %f = sitofp i64 %neg3 to double + %zero = sitofp i64 0 to double + %flag = fcmp olt double %f, %zero +""" + + 
+@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_sitofp_negative(sim_type): + """sitofp must correctly convert a negative integer.""" + check_arith_result(SITOFP_NEG_QIR, "1", sim_type=sim_type) + + +# ######################################################################### +# Dynamic register file sizing (programs exceeding 128 registers) +# ######################################################################### + + +def _run_openqasm(qasm_src: str, shots: int = SHOTS, seed: int = 42, sim_type: str = "cpu"): + """Compile OpenQASM source via the adaptive pass and run on the given simulator.""" + qir = qsharp.openqasm.compile( + qasm_src, + output_semantics=qsharp.openqasm.OutputSemantics.OpenQasm, + target_profile=qsharp.TargetProfile.Adaptive_RIF, + ) + results = run_qir(qir, shots, seed=seed, type=sim_type) + return [map_result_list_to_str(r) for r in results] + + +# ========================================================================= +# Complex RUS loop — requires >128 registers after loop unrolling +# ========================================================================= + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_complex_rus_exceeds_128_registers(sim_type): + """A complex repeat-until-success pattern with 50 iterations. + + The Q# compiler fully unrolls the loop for the Adaptive_RIF profile, + producing ~301 registers — well above the old fixed limit of 128. + This validates that dynamic register file sizing works correctly. 
+ """ + qasm_src = """\ +OPENQASM 3.0; +include "stdgates.inc"; +qubit[4] q; +bit c; +int total = 0; +int i = 0; +while (i < 50) { + h q[0]; + cx q[0], q[1]; + c = measure q[0]; + if (c) { + x q[1]; + reset q[0]; + total = total + 1; + } + h q[2]; + cx q[2], q[3]; + c = measure q[2]; + if (c) { + x q[3]; + reset q[2]; + total = total + 1; + } + i = i + 1; +} +bit[4] result = measure q; +""" + results = _run_openqasm(qasm_src, shots=100, sim_type=sim_type) + assert all( + len(r) >= 4 and all(c in "01" for c in r) for r in results + ), f"Unexpected result format: {results[:5]}" diff --git a/source/pip/tests/test_adaptive_cpu_noise.py b/source/pip/tests/test_adaptive_cpu_noise.py new file mode 100644 index 0000000000..4aec68ccf2 --- /dev/null +++ b/source/pip/tests/test_adaptive_cpu_noise.py @@ -0,0 +1,410 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Noise tests for the adaptive CPU bytecode interpreter. + +Each test targets noise injection by supplying hand-written Adaptive Profile +QIR that exercises noise channels and encodes the expected result into a +measurement outcome. + +This is a CPU counterpart to ``test_adaptive_gpu_noise.py``. 
+""" + +from collections import Counter +from typing import Optional, List +import pytest +import qsharp.openqasm + +from qsharp._simulation import run_qir, NoiseConfig, Result + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +SHOTS = 100 + +SIM_TYPES = ["cpu", "clifford"] + + +def map_result_list_to_str(results: List[Result]): + results_str = "" + for r in results: + match r: + case Result.Zero: + results_str += "0" + case Result.One: + results_str += "1" + case Result.Loss: + results_str += "L" + return results_str + + +def get_histogram( + qir_fragment: str, + *, + extra_decls: str = "", + num_qubits: int = 1, + num_results: int = 1, + noise: Optional[NoiseConfig] = None, + record: Optional[List[int]] = None, + shots=SHOTS, + sim_type: str = "cpu", +): + qir = format_qir( + qir_fragment, + extra_decls=extra_decls, + num_qubits=num_qubits, + num_results=num_results, + record=record, + ) + results = map( + map_result_list_to_str, run_qir(qir, shots, noise, seed=42, type=sim_type) + ) + return Counter(results) + + +def check_result( + qir_fragment: str, + expected: str, + *, + extra_decls: str = "", + num_qubits: int = 1, + num_results: int = 1, + noise: Optional[NoiseConfig] = None, + record: Optional[List[int]] = None, + sim_type: str = "cpu", +): + """Assert every shot produces *expected*.""" + counts = get_histogram( + qir_fragment, + extra_decls=extra_decls, + num_qubits=num_qubits, + num_results=num_results, + noise=noise, + record=record, + sim_type=sim_type, + ) + + assert counts == { + expected: SHOTS + }, f"Expected all {SHOTS} shots to be '{expected}', got {counts}" + + +_DECLS = """\ +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare void @__quantum__qis__mz__body(%Qubit*, %Result*) #1 +declare void 
@__quantum__qis__reset__body(%Qubit*) +declare void @__quantum__qis__cx__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__z__body(%Qubit*) +declare void @__quantum__qis__s__body(%Qubit*) +declare void @__quantum__qis__t__body(%Qubit*) +declare void @__quantum__qis__cz__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__rz__body(double, %Qubit*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__initialize(i8*) +""" + + +def format_qir( + body: str, + *, + extra_decls: str = "", + num_qubits: int = 1, + num_results: int = 1, + record=None, +): + if record is None: + record = range(num_results) + output_recording = ( + f" call void @__quantum__rt__tuple_record_output(i64 {len(record)}, i8* null)" + ) + for result_id in record: + output_recording += f"\n call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 {result_id} to %Result*), i8* null)" + + return f"""\ +%Result = type opaque +%Qubit = type opaque + +define i64 @ENTRYPOINT__main() #0 {{ +{body} +{output_recording} + ret i64 0 +}} + +{_DECLS} +{extra_decls} +attributes #0 = {{ "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="{num_qubits}" "required_num_results"="{num_results}" }} +attributes #1 = {{ "irreversible" }} +""" + + +# The purpose of this test is to inject noise in an identity gate, and assert its behavior. +# Since QIS does not specify an identity gate, we use CNOT and inject noise in the target qubit. 
+I_QIR = """ +entry: + call void @__quantum__qis__cx__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +H_I_H_QIR = """ +entry: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__cx__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_no_noise_on_i_yields_0(sim_type): + check_result(I_QIR, "0", num_qubits=2, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_x_noise_on_i_yields_1(sim_type): + noise = NoiseConfig() + noise.cx.ix = 1.0 + check_result(I_QIR, "1", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_y_noise_on_i_yields_1(sim_type): + noise = NoiseConfig() + noise.cx.iy = 1.0 + check_result(I_QIR, "1", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_z_noise_on_i_yields_0(sim_type): + noise = NoiseConfig() + noise.cx.iz = 1.0 + check_result(I_QIR, "0", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_x_noise_on_h_i_h_yields_0(sim_type): + noise = NoiseConfig() + noise.cx.ix = 1.0 + check_result(H_I_H_QIR, "0", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_y_noise_on_h_i_h_yields_1(sim_type): + noise = NoiseConfig() + noise.cx.iy = 1.0 + check_result(H_I_H_QIR, "1", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def 
test_z_noise_on_h_i_h_yields_1(sim_type): + noise = NoiseConfig() + noise.cx.iz = 1.0 + check_result(H_I_H_QIR, "1", num_qubits=2, noise=noise, sim_type=sim_type) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_probabilistic_x_noise(sim_type): + noise = NoiseConfig() + noise.cx.ix = 0.5 + counts = get_histogram(I_QIR, shots=1000, num_qubits=2, noise=noise, sim_type=sim_type) + + assert counts["0"] > 400, f"Expected ~500 '0' results, got {counts['0']}" + assert counts["1"] > 400, f"Expected ~500 '1' results, got {counts['1']}" + + +QASM_WITH_CORRELATED_NOISE = """ +OPENQASM 3.0; +include "stdgates.inc"; + +@qdk.qir.noise_intrinsic +gate test_noise_intrinsic q0, q1, q2 {} + +qubit[3] qs; +x qs[1]; +test_noise_intrinsic qs[0], qs[1], qs[2]; +bit[3] res = measure qs; +""" + +QIR_WITH_CORRELATED_NOISE = qsharp.openqasm.compile( + QASM_WITH_CORRELATED_NOISE, + output_semantics=qsharp.openqasm.OutputSemantics.OpenQasm, + target_profile=qsharp.TargetProfile.Adaptive_RIF, +) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsics_noiseless(sim_type): + output = run_qir(QIR_WITH_CORRELATED_NOISE, shots=1, noise=None, type=sim_type) + assert output == [[Result.Zero, Result.One, Result.Zero]] + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsics_noisy(sim_type): + noise = NoiseConfig() + table = noise.intrinsic("test_noise_intrinsic", 3) + table.yyy = 1.0 + output = run_qir(QIR_WITH_CORRELATED_NOISE, shots=1, noise=noise, type=sim_type) + assert output == [[Result.One, Result.Zero, Result.One]] + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsics_load_csv_dir(sim_type): + noise = NoiseConfig() + noise.load_csv_dir("./csv_dir_test") + output = run_qir(QIR_WITH_CORRELATED_NOISE, shots=1, noise=noise, type=sim_type) + assert output == [[Result.One, Result.Zero, Result.One]] + + +NOISE_INTRINSICS_WITH_REGISTERS_QIR = r""" +%Result = type opaque +%Qubit = type opaque + +@0 = internal 
constant [4 x i8] c"0_a\00" +@1 = internal constant [6 x i8] c"1_a0r\00" +@2 = internal constant [6 x i8] c"2_a1r\00" +@3 = internal constant [6 x i8] c"3_a2r\00" + +define i64 @ENTRYPOINT__main() #0 { +block_0: + %q1 = inttoptr i64 0 to %Qubit* + %q2 = inttoptr i64 1 to %Qubit* + %q3 = inttoptr i64 2 to %Qubit* + call void @__quantum__rt__initialize(i8* null) + call void @__quantum__qis__x__body(%Qubit* %q2) + call void @test_noise_intrinsic(%Qubit* %q1, %Qubit* %q2, %Qubit* %q3) + call void @__quantum__qis__m__body(%Qubit* %q1, %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__m__body(%Qubit* %q2, %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__m__body(%Qubit* %q3, %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__rt__array_record_output(i64 3, i8* getelementptr inbounds ([4 x i8], [4 x i8]* @0, i64 0, i64 0)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @1, i64 0, i64 0)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @2, i64 0, i64 0)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @3, i64 0, i64 0)) + ret i64 0 +} + +declare void @__quantum__rt__initialize(i8*) +declare void @__quantum__qis__x__body(%Qubit*) +declare void @test_noise_intrinsic(%Qubit*, %Qubit*, %Qubit*) #2 +declare void @__quantum__qis__m__body(%Qubit*, %Result*) #1 +declare void @__quantum__rt__array_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "output_labeling_schema" "qir_profiles"="adaptive_profile" "required_num_qubits"="3" "required_num_results"="3" } +attributes #1 = { "irreversible" } +attributes #2 = { "qdk_noise" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} + +!0 = !{i32 1, 
!"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 false} +!4 = !{i32 5, !"int_computations", !{!"i64"}} +!5 = !{i32 5, !"float_computations", !{!"double"}} +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsics_with_registers_noisy(sim_type): + noise = NoiseConfig() + table = noise.intrinsic("test_noise_intrinsic", 3) + table.yyy = 1.0 + output = run_qir( + NOISE_INTRINSICS_WITH_REGISTERS_QIR, shots=1, noise=noise, type=sim_type + ) + assert output == [[Result.One, Result.Zero, Result.One]] + + +# --- Tests for varied qubit counts (1, 2, 5) --- + +QASM_NOISE_1Q = """ +OPENQASM 3.0; +include "stdgates.inc"; + +@qdk.qir.noise_intrinsic +gate noise_1q q0 {} + +qubit q; +noise_1q q; +bit res = measure q; +""" + +QIR_NOISE_1Q = qsharp.openqasm.compile( + QASM_NOISE_1Q, + output_semantics=qsharp.openqasm.OutputSemantics.OpenQasm, + target_profile=qsharp.TargetProfile.Adaptive_RIF, +) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsic_1q_x_flip(sim_type): + noise = NoiseConfig() + table = noise.intrinsic("noise_1q", 1) + table.x = 1.0 + output = run_qir(QIR_NOISE_1Q, shots=1, noise=noise, type=sim_type) + assert output == [[Result.One]] + + +QASM_NOISE_2Q = """ +OPENQASM 3.0; +include "stdgates.inc"; + +@qdk.qir.noise_intrinsic +gate noise_2q q0, q1 {} + +qubit[2] qs; +x qs[0]; +noise_2q qs[0], qs[1]; +bit[2] res = measure qs; +""" + +QIR_NOISE_2Q = qsharp.openqasm.compile( + QASM_NOISE_2Q, + output_semantics=qsharp.openqasm.OutputSemantics.OpenQasm, + target_profile=qsharp.TargetProfile.Adaptive_RIF, +) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsic_2q_xx_flip(sim_type): + noise = NoiseConfig() + table = noise.intrinsic("noise_2q", 2) + table.xx = 1.0 + # qs[0] was |1>, qs[1] was |0> -> XX flips both -> qs[0]=|0>, qs[1]=|1> + output = run_qir(QIR_NOISE_2Q, 
shots=1, noise=noise, type=sim_type) + assert output == [[Result.Zero, Result.One]] + + +QASM_NOISE_5Q = """ +OPENQASM 3.0; +include "stdgates.inc"; + +@qdk.qir.noise_intrinsic +gate noise_5q q0, q1, q2, q3, q4 {} + +qubit[5] qs; +x qs[1]; +x qs[3]; +noise_5q qs[0], qs[1], qs[2], qs[3], qs[4]; +bit[5] res = measure qs; +""" + +QIR_NOISE_5Q = qsharp.openqasm.compile( + QASM_NOISE_5Q, + output_semantics=qsharp.openqasm.OutputSemantics.OpenQasm, + target_profile=qsharp.TargetProfile.Adaptive_RIF, +) + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_noise_intrinsic_5q_xxxxx_flip(sim_type): + noise = NoiseConfig() + table = noise.intrinsic("noise_5q", 5) + table.xxxxx = 1.0 + # Initial: |01010> -> XXXXX flips all -> |10101> + output = run_qir(QIR_NOISE_5Q, shots=1, noise=noise, type=sim_type) + assert output == [[Result.One, Result.Zero, Result.One, Result.Zero, Result.One]] diff --git a/source/pip/tests/test_adaptive_cpu_quantum_ops.py b/source/pip/tests/test_adaptive_cpu_quantum_ops.py new file mode 100644 index 0000000000..db34a07c5d --- /dev/null +++ b/source/pip/tests/test_adaptive_cpu_quantum_ops.py @@ -0,0 +1,373 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""End-to-end tests for the adaptive CPU bytecode interpreter pipeline. + +Tests run Adaptive Profile QIR through the full pipeline: +Python AdaptiveProfilePass → Rust receiver → CPU interpreter → results. + +This is a CPU counterpart to ``test_adaptive_gpu_quantum_ops.py``. + +For smaller tests covering the full Adaptive Profile instruction set, +see ``test_adaptive_cpu_bytecode.py``. 
+""" + +from collections import Counter + +import pytest + +from qsharp._simulation import run_qir, Result + +SIM_TYPES = ["cpu", "clifford"] + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def map_result_list_to_str(results): + results_str = "" + for r in results: + match r: + case Result.Zero: + results_str += "0" + case Result.One: + results_str += "1" + case Result.Loss: + results_str += "L" + return results_str + + +def _run(qir: str, shots: int, seed: int = 42, sim_type: str = "cpu"): + """Run *qir* on the given simulator and return shot results as a list of strings.""" + results = run_qir(qir, shots, seed=seed, type=sim_type) + return [map_result_list_to_str(r) for r in results] + + +# --------------------------------------------------------------------------- +# QIR source +# --------------------------------------------------------------------------- + +# Example 1: Measure-and-correct (H → MResetZ → read_result → branch → X) +MEASURE_AND_CORRECT_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define void @ENTRYPOINT__main() #0 { +entry: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %r = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + br i1 %r, label %then, label %end + +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %end + +end: + call void @__quantum__rt__tuple_record_output(i64 1, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* null) + ret void +} + +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void 
@__quantum__qis__x__body(%Qubit*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="1" "required_num_results"="1" } +""" + +# Example 3: Conditionally terminating loop +CONDITIONAL_LOOP_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define void @ENTRYPOINT__main() #0 { +entry: + br label %loop + +loop: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %r = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + br i1 %r, label %done, label %loop + +done: + call void @__quantum__rt__tuple_record_output(i64 1, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* null) + ret void +} + +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="1" "required_num_results"="1" } +""" + +# Example 2: Loop with phi node — GHZ state preparation +LOOP_WITH_PHI_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define void @ENTRYPOINT__main() #0 { +entry: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %loop + +loop: + %i = phi i64 [ 1, %entry ], [ %next_i, %loop ] + %qi = inttoptr i64 %i to %Qubit* + call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Qubit* %qi) + %next_i = add i64 %i, 1 + %cond = icmp sle i64 %next_i, 4 + br i1 %cond, label %loop, label %measure + +measure: + call void 
@__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 3 to %Qubit*), %Result* inttoptr (i64 3 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 4 to %Qubit*), %Result* inttoptr (i64 4 to %Result*)) + call void @__quantum__rt__tuple_record_output(i64 5, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 4 to %Result*), i8* null) + ret void +} + +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="5" "required_num_results"="5" } +""" + +# Example 4: Classical boolean computation +BOOLEAN_COMPUTATION_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define void @ENTRYPOINT__main() #0 { +entry: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr 
(i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + %r0 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + %r1 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + %both = and i1 %r0, %r1 + br i1 %both, label %then, label %else + +then: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + br label %end + +else: + br label %end + +end: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__rt__tuple_record_output(i64 1, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) + ret void +} + +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="2" "required_num_results"="3" } +""" + +# Example 5: Teleport chain +TELEPORT_CHAIN_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +@0 = internal constant [5 x i8] c"0_t0\\00" +@1 = internal constant [5 x i8] c"0_t1\\00" + +define void @TeleportChain() #0 { +entry: + call void @__quantum__rt__initialize(i8* null) + br label %body +body: + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 2 to %Qubit*)) + call void @__quantum__qis__cnot__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Qubit* inttoptr (i64 4 to %Qubit*)) + call void 
@__quantum__qis__cnot__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Qubit* inttoptr (i64 2 to %Qubit*)) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + %0 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 0 to %Result*)) + br i1 %0, label %then__1, label %continue__1 +then__1: + call void @__quantum__qis__z__body(%Qubit* inttoptr (i64 4 to %Qubit*)) + br label %continue__1 +continue__1: + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 2 to %Qubit*)) + %1 = call i1 @__quantum__qis__read_result__body(%Result* inttoptr (i64 1 to %Result*)) + br i1 %1, label %then__2, label %continue__2 +then__2: + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 4 to %Qubit*)) + br label %continue__2 +continue__2: + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 4 to %Qubit*), %Result* inttoptr (i64 3 to %Result*)) + call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 4 to %Qubit*)) + br label %exit +exit: + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @0, i32 0, i32 0)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @1, i32 0, i32 0)) + ret void +} + +declare void @__quantum__qis__cnot__body(%Qubit*, %Qubit*) +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__x__body(%Qubit*) +declare void 
@__quantum__qis__z__body(%Qubit*) +declare void @__quantum__qis__reset__body(%Qubit*) +declare void @__quantum__qis__mz__body(%Qubit*, %Result*) #1 +declare void @__quantum__rt__initialize(i8*) +declare i1 @__quantum__qis__read_result__body(%Result*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="5" "required_num_results"="4" } +attributes #1 = { "irreversible" } +""" + + +# --------------------------------------------------------------------------- +# Tests +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_measure_and_correct_histogram(sim_type): + """Example 1: H → MResetZ → read_result → conditional X. + + Run 10000 shots and verify ~50/50 split of "0" and "1" outcomes. + """ + results = _run(MEASURE_AND_CORRECT_QIR, shots=10000, seed=42, sim_type=sim_type) + assert len(results) == 10000 + + counts = Counter(results) + count_0 = counts.get("0", 0) + count_1 = counts.get("1", 0) + + assert count_0 > 4000, f"Expected ~5000 '0' results, got {count_0}" + assert count_1 > 4000, f"Expected ~5000 '1' results, got {count_1}" + assert count_0 + count_1 == 10000, "All shots should produce a result" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_conditional_loop_all_results_are_one(sim_type): + """Example 3: The loop exits only when measurement yields 1. + + Every shot's recorded result should be "1". 
+ """ + shots = 5000 + results = _run(CONDITIONAL_LOOP_QIR, shots=shots, seed=99, sim_type=sim_type) + assert len(results) == shots + + counts = Counter(results) + assert ( + counts.get("1", 0) == shots + ), f"Expected all {shots} shots to produce '1', got counts: {counts}" + + +# --------------------------------------------------------------------------- +# Tests — Example 2: Loop with phi (GHZ state) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_loop_with_phi_ghz_histogram(sim_type): + """Example 2: H → loop CNOT(q0, q_i) for i=1..4 → measure all. + + Creates (|00000⟩ + |11111⟩)/√2. All 5 measurements must agree. + """ + results = _run(LOOP_WITH_PHI_QIR, shots=10000, seed=42, sim_type=sim_type) + assert len(results) == 10000 + + counts = Counter(results) + assert set(counts.keys()) <= { + "00000", + "11111", + }, f"Unexpected outcomes in GHZ state: {counts}" + + count_00000 = counts.get("00000", 0) + count_11111 = counts.get("11111", 0) + + assert count_00000 > 4000, f"Expected ~5000 '00000' results, got {count_00000}" + assert count_11111 > 4000, f"Expected ~5000 '11111' results, got {count_11111}" + assert count_00000 + count_11111 == 10000, "All shots should produce a result" + + +# --------------------------------------------------------------------------- +# Tests — Example 4: Boolean computation (AND gate) +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_boolean_computation_histogram(sim_type): + """Example 4: H(q0), H(q1) → MResetZ both → AND results → conditional X. + + r2=1 only when both r0=1 AND r1=1 (~25% of shots). 
+ """ + results = _run(BOOLEAN_COMPUTATION_QIR, shots=10000, seed=42, sim_type=sim_type) + assert len(results) == 10000 + + counts = Counter(results) + count_0 = counts.get("0", 0) + count_1 = counts.get("1", 0) + + assert 1500 < count_1 < 3500, f"Expected ~2500 '1' results (~25%), got {count_1}" + assert 6500 < count_0 < 8500, f"Expected ~7500 '0' results (~75%), got {count_0}" + assert count_0 + count_1 == 10000, "All shots should produce a result" + + +# --------------------------------------------------------------------------- +# Tests — Example 5: Teleport chain +# --------------------------------------------------------------------------- + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_teleport_chain_histogram(sim_type): + """Example 5: Teleport chain with 2 Bell pairs and measure-and-correct. + + Final measurements of q0 and q4 should be correlated: + both "0" or both "1", near 50/50. + """ + results = _run(TELEPORT_CHAIN_QIR, shots=10000, seed=42, sim_type=sim_type) + assert len(results) == 10000 + + counts = Counter(results) + assert set(counts.keys()) <= { + "00", + "11", + }, f"Unexpected outcomes in teleport chain: {counts}" + + count_00 = counts.get("00", 0) + count_11 = counts.get("11", 0) + + assert count_00 > 4000, f"Expected ~5000 '00' results, got {count_00}" + assert count_11 > 4000, f"Expected ~5000 '11' results, got {count_11}" + assert count_00 + count_11 == 10000, "All shots should produce a result" diff --git a/source/pip/tests/test_adaptive_gpu_noise.py b/source/pip/tests/test_adaptive_gpu_noise.py index e44d55294c..819f773b30 100644 --- a/source/pip/tests/test_adaptive_gpu_noise.py +++ b/source/pip/tests/test_adaptive_gpu_noise.py @@ -228,7 +228,7 @@ def test_z_noise_on_h_i_h_yields_1(): def test_probabilistic_x_noise(): noise = NoiseConfig() noise.cx.ix = 0.5 - counts = get_histogram(I_QIR, shots=1000, noise=noise) + counts = get_histogram(I_QIR, shots=1000, num_qubits=2, noise=noise) assert counts["0"] > 400, f"Expected 
~500 '0' results, got {counts['0']}" assert counts["1"] > 400, f"Expected ~500 '1' results, got {counts['1']}" diff --git a/source/pip/tests/test_clifford_simulator.py b/source/pip/tests/test_clifford_simulator.py index 2c47fc0c8e..76acfd934e 100644 --- a/source/pip/tests/test_clifford_simulator.py +++ b/source/pip/tests/test_clifford_simulator.py @@ -101,7 +101,7 @@ def test_s_adj_noise_inherits_from_rz(): assert output == [Result.One] -def test_program_with_branching_fails(): +def test_program_with_branching_succeeds(): qsharp.init(target_profile=TargetProfile.Adaptive_RI) qsharp.eval( """ @@ -116,14 +116,8 @@ def test_program_with_branching_fails(): """ ) ir = qsharp.compile("Main()") - try: - run_qir_clifford(str(ir), 1, NoiseConfig()) - assert False, "Expected ValueError for branching control flow" - except ValueError as e: - assert ( - "simulation of programs with branching control flow is not supported" - in str(e) - ) + results = run_qir_clifford(str(ir), 1, NoiseConfig()) + assert len(results) == 1 def test_program_with_unconditional_branching_succeeds(): diff --git a/source/simulators/src/bytecode.rs b/source/simulators/src/bytecode.rs index 6fb246e857..afc77f5685 100644 --- a/source/simulators/src/bytecode.rs +++ b/source/simulators/src/bytecode.rs @@ -5,6 +5,8 @@ //! //! Values must stay in sync with the Python `_adaptive_opcodes.py` module. 
+pub mod runtime; + use bytemuck::{Pod, Zeroable}; use num_traits::Unsigned; @@ -167,8 +169,8 @@ impl Function { #[repr(C)] #[derive(Copy, Clone, Debug, Default, Zeroable)] pub struct PhiNodeEntry { - block_id: Word, - val_reg: Word, + pub block_id: Word, + pub val_reg: Word, } impl PhiNodeEntry { @@ -186,8 +188,8 @@ impl PhiNodeEntry { #[repr(C)] #[derive(Copy, Clone, Debug, Default, Zeroable)] pub struct SwitchCase { - case_val: Word, - target_block: Word, + pub case_val: Word, + pub target_block: Word, } impl SwitchCase { diff --git a/source/simulators/src/bytecode/runtime.rs b/source/simulators/src/bytecode/runtime.rs new file mode 100644 index 0000000000..db0d0e6894 --- /dev/null +++ b/source/simulators/src/bytecode/runtime.rs @@ -0,0 +1,690 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +//! CPU bytecode interpreter for the Adaptive Profile QIR. + +// The interpreter intentionally uses u64 registers and must cast between u64, i64, +// usize, and u32 pervasively. These casts are correct by construction (values come +// from a well-formed bytecode program). Suppressing the pedantic clippy lints here +// keeps the opcode dispatch readable. +#![allow( + clippy::cast_possible_truncation, + clippy::cast_possible_wrap, + clippy::cast_precision_loss, + clippy::cast_sign_loss, + clippy::float_cmp, + clippy::match_same_arms, + clippy::single_match_else, + clippy::too_many_lines +)] + +use crate::{ + MeasurementResult, Simulator, + bytecode::{AdaptiveProgram, Instruction}, +}; + +// --------------------------------------------------------------------------- +// Opcode constants — must stay in sync with the Python `_adaptive_bytecode.py` +// and the WGSL `simulator_adaptive.wgsl` shader. 
+// --------------------------------------------------------------------------- + +// Flags (pre-shifted to bit 16+) +const FLAG_SRC0_IMM: u64 = 1 << 16; +const FLAG_SRC1_IMM: u64 = 1 << 17; +const FLAG_DST_IMM: u64 = 1 << 18; +const FLAG_AUX0_IMM: u64 = 1 << 19; +const FLAG_AUX1_IMM: u64 = 1 << 20; +const FLAG_AUX2_IMM: u64 = 1 << 21; +const FLAG_AUX3_IMM: u64 = 1 << 22; + +// Control flow +const OP_NOP: u8 = 0x00; +const OP_RET: u8 = 0x02; +const OP_JUMP: u8 = 0x04; +const OP_BRANCH: u8 = 0x05; +const OP_SWITCH: u8 = 0x06; +const OP_CALL: u8 = 0x07; +const OP_CALL_RETURN: u8 = 0x08; + +// Quantum +const OP_QUANTUM_GATE: u8 = 0x10; +const OP_MEASURE: u8 = 0x11; +const OP_RESET: u8 = 0x12; +const OP_READ_RESULT: u8 = 0x13; +const OP_RECORD_OUTPUT: u8 = 0x14; + +// Integer arithmetic +const OP_ADD: u8 = 0x20; +const OP_SUB: u8 = 0x21; +const OP_MUL: u8 = 0x22; +const OP_UDIV: u8 = 0x23; +const OP_SDIV: u8 = 0x24; +const OP_UREM: u8 = 0x25; +const OP_SREM: u8 = 0x26; + +// Bitwise / shift +const OP_AND: u8 = 0x28; +const OP_OR: u8 = 0x29; +const OP_XOR: u8 = 0x2A; +const OP_SHL: u8 = 0x2B; +const OP_LSHR: u8 = 0x2C; +const OP_ASHR: u8 = 0x2D; + +// Comparison +const OP_ICMP: u8 = 0x30; +const OP_FCMP: u8 = 0x31; + +// Float arithmetic +const OP_FADD: u8 = 0x38; +const OP_FSUB: u8 = 0x39; +const OP_FMUL: u8 = 0x3A; +const OP_FDIV: u8 = 0x3B; + +// Type conversion +const OP_ZEXT: u8 = 0x40; +const OP_SEXT: u8 = 0x41; +const OP_TRUNC: u8 = 0x42; +const OP_FPEXT: u8 = 0x43; +const OP_FPTRUNC: u8 = 0x44; +const OP_INTTOPTR: u8 = 0x45; +const OP_FPTOSI: u8 = 0x46; +const OP_SITOFP: u8 = 0x47; + +// SSA / data movement +const OP_PHI: u8 = 0x50; +const OP_SELECT: u8 = 0x51; +const OP_MOV: u8 = 0x52; +const OP_CONST: u8 = 0x53; + +// ICmp condition codes (sub-opcode) +const ICMP_EQ: u8 = 0; +const ICMP_NE: u8 = 1; +const ICMP_SLT: u8 = 2; +const ICMP_SLE: u8 = 3; +const ICMP_SGT: u8 = 4; +const ICMP_SGE: u8 = 5; +const ICMP_ULT: u8 = 6; +const ICMP_ULE: u8 = 7; +const 
ICMP_UGT: u8 = 8; +const ICMP_UGE: u8 = 9; + +// FCmp condition codes (sub-opcode) +const FCMP_OEQ: u8 = 1; +const FCMP_OGT: u8 = 2; +const FCMP_OGE: u8 = 3; +const FCMP_OLT: u8 = 4; +const FCMP_OLE: u8 = 5; +const FCMP_ONE: u8 = 6; + +// Quantum op IDs — must match `shader_types.rs` `OpID` and `GATE_MAP` in `_adaptive_pass.py`. +const OPID_RESETZ: u64 = 1; +const OPID_X: u64 = 2; +const OPID_Y: u64 = 3; +const OPID_Z: u64 = 4; +const OPID_H: u64 = 5; +const OPID_S: u64 = 6; +const OPID_S_ADJ: u64 = 7; +const OPID_T: u64 = 8; +const OPID_T_ADJ: u64 = 9; +const OPID_SX: u64 = 10; +const OPID_SX_ADJ: u64 = 11; +const OPID_RX: u64 = 12; +const OPID_RY: u64 = 13; +const OPID_RZ: u64 = 14; +const OPID_CX: u64 = 15; +const OPID_CZ: u64 = 16; +const OPID_RXX: u64 = 17; +const OPID_RYY: u64 = 18; +const OPID_RZZ: u64 = 19; +const OPID_MZ: u64 = 21; +const OPID_MRESETZ: u64 = 22; +const OPID_SWAP: u64 = 24; +const OPID_MOVE: u64 = 28; +const OPID_CY: u64 = 29; +const OPID_CORRELATED_NOISE: u64 = 131; + +// Sentinel +const VOID_RETURN: u64 = 0xFFFF_FFFF; + +// --------------------------------------------------------------------------- +// Runtime state +// --------------------------------------------------------------------------- + +struct CallStackFrame { + block_id: u64, + return_pc: u64, + return_reg: u64, +} + +struct Runtime { + pc: u64, + current_block_id: u64, + previous_block_id: u64, + exit_code: u64, + registers: Vec, + call_stack: Vec, +} + +impl Runtime { + fn new(num_registers: u32, entry_block: u64, entry_pc: u64) -> Self { + Self { + pc: entry_pc, + current_block_id: entry_block, + previous_block_id: 0, + exit_code: 0, + registers: vec![0; num_registers as usize], + call_stack: Vec::with_capacity(128), + } + } + + fn read_reg(&self, reg: u64) -> u64 { + self.registers[reg as usize] + } + + fn write_reg(&mut self, reg: u64, val: u64) { + self.registers[reg as usize] = val; + } + + fn resolve_u64(&self, operand: u64, flags: u64, operand_idx: u64) -> u64 { + let 
imm_flag = match operand_idx { + 0 => FLAG_SRC0_IMM, + 1 => FLAG_SRC1_IMM, + 2 => FLAG_DST_IMM, + 3 => FLAG_AUX0_IMM, + 4 => FLAG_AUX1_IMM, + 5 => FLAG_AUX2_IMM, + 6 => FLAG_AUX3_IMM, + _ => panic!("invalid operand index {operand_idx}"), + }; + if flags & imm_flag != 0 { + operand + } else { + self.read_reg(operand) + } + } + + fn resolve_i64(&self, operand: u64, flags: u64, operand_idx: u64) -> i64 { + self.resolve_u64(operand, flags, operand_idx) as i64 + } + + fn resolve_f64(&self, operand: u64, flags: u64, operand_idx: u64) -> f64 { + f64::from_bits(self.resolve_u64(operand, flags, operand_idx)) + } + + fn write_f64(&mut self, reg: u64, val: f64) { + self.write_reg(reg, val.to_bits()); + } +} + +// --------------------------------------------------------------------------- +// Quantum op dispatch +// --------------------------------------------------------------------------- + +fn dispatch_quantum_gate( + program: &AdaptiveProgram, + sim: &mut S, + instr: &Instruction, + rt: &Runtime, +) { + let op_idx = instr.aux0 as usize; + let op = &program.quantum_ops[op_idx]; + let op_id = op.op_id; + + match op_id { + OPID_CORRELATED_NOISE => { + let qubit_count = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + let arg_offset = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; + let table_id = op.q1 as u32; + let targets: Vec = (0..qubit_count) + .map(|i| rt.read_reg(program.call_args[arg_offset + i]) as usize) + .collect(); + sim.correlated_noise_intrinsic(table_id, &targets); + } + _ => { + let q1 = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + let q2 = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; + let angle = op.angle; + match op_id { + OPID_X => sim.x(q1), + OPID_Y => sim.y(q1), + OPID_Z => sim.z(q1), + OPID_H => sim.h(q1), + OPID_S => sim.s(q1), + OPID_S_ADJ => sim.s_adj(q1), + OPID_T => sim.t(q1), + OPID_T_ADJ => sim.t_adj(q1), + OPID_SX => sim.sx(q1), + OPID_SX_ADJ => sim.sx_adj(q1), + OPID_RX => sim.rx(angle, q1), + OPID_RY => 
sim.ry(angle, q1), + OPID_RZ => sim.rz(angle, q1), + OPID_CX => sim.cx(q1, q2), + OPID_CY => sim.cy(q1, q2), + OPID_CZ => sim.cz(q1, q2), + OPID_RXX => sim.rxx(angle, q1, q2), + OPID_RYY => sim.ryy(angle, q1, q2), + OPID_RZZ => sim.rzz(angle, q1, q2), + OPID_SWAP => sim.swap(q1, q2), + OPID_MOVE => sim.mov(q1), + _ => panic!("unsupported quantum gate op_id={op_id}"), + } + } + } +} + +fn dispatch_measure( + program: &AdaptiveProgram, + sim: &mut S, + instr: &Instruction, + rt: &Runtime, +) { + let op_idx = instr.aux0 as usize; + let op = &program.quantum_ops[op_idx]; + let qubit = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + let result_id = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; + + match op.op_id { + OPID_MZ => sim.mz(qubit, result_id), + OPID_MRESETZ => sim.mresetz(qubit, result_id), + _ => panic!("unsupported measure op_id={}", op.op_id), + } +} + +fn dispatch_reset( + program: &AdaptiveProgram, + sim: &mut S, + instr: &Instruction, + rt: &Runtime, +) { + let op_idx = instr.aux0 as usize; + let op = &program.quantum_ops[op_idx]; + let qubit = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + + match op.op_id { + OPID_RESETZ => sim.resetz(qubit), + _ => panic!("unsupported reset op_id={}", op.op_id), + } +} + +// --------------------------------------------------------------------------- +// Main interpreter entry point +// --------------------------------------------------------------------------- + +pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { + const MAX_STEPS: u64 = 10_000_000; + + let entry_pc = program.block_table[program.entry_block as usize].instr_offset; + let mut rt = Runtime::new(program.num_registers, program.entry_block, entry_pc); + + for _ in 0..MAX_STEPS { + let instr = program.instructions[rt.pc as usize]; + let op = instr.primary_opcode(); + let subcode = instr.sub_opcode(); + let flags = instr.opcode; + + match op { + OP_NOP => rt.pc += 1, + + OP_RET => { + rt.exit_code = rt.resolve_u64(instr.dst, 
flags, 2); + break; + } + + OP_JUMP => { + rt.previous_block_id = rt.current_block_id; + rt.current_block_id = instr.dst; + rt.pc = block_pc(program, rt.current_block_id); + } + + OP_BRANCH => { + let cond = rt.resolve_u64(instr.src0, flags, 0) != 0; + let next_block = if cond { instr.aux0 } else { instr.aux1 }; + rt.previous_block_id = rt.current_block_id; + rt.current_block_id = next_block; + rt.pc = block_pc(program, rt.current_block_id); + } + + OP_SWITCH => { + let val = rt.resolve_u64(instr.src0, flags, 0); + let default_block = instr.aux0; + let case_offset = instr.aux1 as usize; + let case_count = instr.aux2 as usize; + let mut target_block = default_block; + for i in 0..case_count { + let entry = program.switch_cases[case_offset + i]; + if entry.case_val == val { + target_block = entry.target_block; + break; + } + } + rt.previous_block_id = rt.current_block_id; + rt.current_block_id = target_block; + rt.pc = block_pc(program, rt.current_block_id); + } + + OP_CALL => { + let func_id = instr.aux0 as usize; + let arg_count = instr.aux1 as usize; + let arg_offset = instr.aux2 as usize; + let func = program.function_table[func_id]; + + rt.call_stack.push(CallStackFrame { + block_id: rt.current_block_id, + return_pc: rt.pc + 1, + return_reg: instr.dst, + }); + + let param_base = func.param_base_reg; + for i in 0..arg_count { + let arg_reg = program.call_args[arg_offset + i]; + let val = rt.read_reg(arg_reg); + rt.write_reg(param_base + i as u64, val); + } + + rt.current_block_id = func.entry_block_id; + rt.pc = block_pc(program, rt.current_block_id); + } + + OP_CALL_RETURN => { + let frame = rt.call_stack.pop().expect("call stack underflow"); + let return_block = frame.block_id; + let return_pc = frame.return_pc; + let return_reg = frame.return_reg; + + rt.current_block_id = return_block; + rt.pc = return_pc; + if return_reg != VOID_RETURN { + let ret_val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(return_reg, ret_val); + } + } + + // ----- Quantum 
operations ----- + OP_QUANTUM_GATE => { + dispatch_quantum_gate(program, sim, &instr, &rt); + rt.pc += 1; + } + + OP_MEASURE => { + dispatch_measure(program, sim, &instr, &rt); + rt.pc += 1; + } + + OP_RESET => { + dispatch_reset(program, sim, &instr, &rt); + rt.pc += 1; + } + + OP_READ_RESULT => { + let result_id = rt.resolve_u64(instr.src0, flags, 0) as usize; + let measurements = sim.measurements(); + let val = if result_id < measurements.len() { + match measurements[result_id] { + MeasurementResult::One => 1u64, + _ => 0u64, + } + } else { + 0u64 + }; + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + + OP_RECORD_OUTPUT => { + // No-op on CPU — results are read from the simulator directly. + rt.pc += 1; + } + + // ----- Integer arithmetic ----- + OP_ADD => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a.wrapping_add(b) as u64); + rt.pc += 1; + } + + OP_SUB => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a.wrapping_sub(b) as u64); + rt.pc += 1; + } + + OP_MUL => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a.wrapping_mul(b) as u64); + rt.pc += 1; + } + + OP_UDIV => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a / b); + rt.pc += 1; + } + + OP_SDIV => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a.wrapping_div(b) as u64); + rt.pc += 1; + } + + OP_UREM => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a % b); + rt.pc += 1; + } + + OP_SREM => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a.wrapping_rem(b) as u64); + rt.pc += 
1; + } + + // ----- Bitwise / shift ----- + OP_AND => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a & b); + rt.pc += 1; + } + + OP_OR => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a | b); + rt.pc += 1; + } + + OP_XOR => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1); + rt.write_reg(instr.dst, a ^ b); + rt.pc += 1; + } + + OP_SHL => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1) as u32; + rt.write_reg(instr.dst, a.wrapping_shl(b)); + rt.pc += 1; + } + + OP_LSHR => { + let a = rt.resolve_u64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1) as u32; + rt.write_reg(instr.dst, a.wrapping_shr(b)); + rt.pc += 1; + } + + OP_ASHR => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_u64(instr.src1, flags, 1) as u32; + rt.write_reg(instr.dst, a.wrapping_shr(b) as u64); + rt.pc += 1; + } + + // ----- Integer comparison ----- + OP_ICMP => { + let a = rt.resolve_i64(instr.src0, flags, 0); + let b = rt.resolve_i64(instr.src1, flags, 1); + let result = match subcode { + ICMP_EQ => a == b, + ICMP_NE => a != b, + ICMP_SLT => a < b, + ICMP_SLE => a <= b, + ICMP_SGT => a > b, + ICMP_SGE => a >= b, + ICMP_ULT => (a as u64) < (b as u64), + ICMP_ULE => (a as u64) <= (b as u64), + ICMP_UGT => (a as u64) > (b as u64), + ICMP_UGE => (a as u64) >= (b as u64), + _ => panic!("unsupported icmp condition code {subcode}"), + }; + rt.write_reg(instr.dst, u64::from(result)); + rt.pc += 1; + } + + // ----- Float comparison ----- + OP_FCMP => { + let a = rt.resolve_f64(instr.src0, flags, 0); + let b = rt.resolve_f64(instr.src1, flags, 1); + let result = match subcode { + FCMP_OEQ => a == b, + FCMP_ONE => a != b, + FCMP_OLT => a < b, + FCMP_OLE => a <= b, + FCMP_OGT => a > b, + FCMP_OGE => a >= b, + _ => 
panic!("unsupported fcmp condition code {subcode}"), + }; + rt.write_reg(instr.dst, u64::from(result)); + rt.pc += 1; + } + + // ----- Float arithmetic ----- + OP_FADD => { + let a = rt.resolve_f64(instr.src0, flags, 0); + let b = rt.resolve_f64(instr.src1, flags, 1); + rt.write_f64(instr.dst, a + b); + rt.pc += 1; + } + + OP_FSUB => { + let a = rt.resolve_f64(instr.src0, flags, 0); + let b = rt.resolve_f64(instr.src1, flags, 1); + rt.write_f64(instr.dst, a - b); + rt.pc += 1; + } + + OP_FMUL => { + let a = rt.resolve_f64(instr.src0, flags, 0); + let b = rt.resolve_f64(instr.src1, flags, 1); + rt.write_f64(instr.dst, a * b); + rt.pc += 1; + } + + OP_FDIV => { + let a = rt.resolve_f64(instr.src0, flags, 0); + let b = rt.resolve_f64(instr.src1, flags, 1); + rt.write_f64(instr.dst, a / b); + rt.pc += 1; + } + + // ----- Type conversion ----- + OP_ZEXT => { + let val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + + OP_SEXT => { + let val = rt.resolve_i64(instr.src0, flags, 0); + let src_bits = instr.aux0 as u32; + let result = if src_bits > 0 && src_bits < 64 { + let shift = 64 - src_bits; + (val.wrapping_shl(shift)).wrapping_shr(shift) + } else { + val + }; + rt.write_reg(instr.dst, result as u64); + rt.pc += 1; + } + + OP_TRUNC => { + let val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + + OP_FPEXT | OP_FPTRUNC => { + let val = rt.resolve_f64(instr.src0, flags, 0); + rt.write_f64(instr.dst, val); + rt.pc += 1; + } + + OP_INTTOPTR => { + let val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + + OP_FPTOSI => { + let val = rt.resolve_f64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val as i64 as u64); + rt.pc += 1; + } + + OP_SITOFP => { + let val = rt.resolve_i64(instr.src0, flags, 0); + rt.write_f64(instr.dst, val as f64); + rt.pc += 1; + } + + // ----- PHI node ----- + OP_PHI => { + let offset = instr.aux0 as usize; + let count = 
instr.aux1 as usize; + for i in 0..count { + let entry = program.phi_entries[offset + i]; + if entry.block_id == rt.previous_block_id { + let val = rt.read_reg(entry.val_reg); + rt.write_reg(instr.dst, val); + break; + } + } + rt.pc += 1; + } + + // ----- Data movement ----- + OP_SELECT => { + let cond = rt.resolve_u64(instr.src0, flags, 0) != 0; + let true_val = rt.resolve_u64(instr.aux0, flags, 3); + let false_val = rt.resolve_u64(instr.aux1, flags, 4); + rt.write_reg(instr.dst, if cond { true_val } else { false_val }); + rt.pc += 1; + } + + OP_MOV => { + let val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + + OP_CONST => { + rt.write_reg(instr.dst, instr.src0); + rt.pc += 1; + } + + _ => panic!("unsupported opcode 0x{op:02X} at pc={}", rt.pc), + } + } +} + +fn block_pc(program: &AdaptiveProgram, block_id: u64) -> u64 { + program.block_table[block_id as usize].instr_offset +} From 88cd287988497caca42c6f2a66cc47ad2b10f792 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Wed, 1 Apr 2026 15:28:47 -0700 Subject: [PATCH 03/14] cargo fmt --- source/pip/src/interpreter.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/source/pip/src/interpreter.rs b/source/pip/src/interpreter.rs index e488bf3b88..be4cae3c77 100644 --- a/source/pip/src/interpreter.rs +++ b/source/pip/src/interpreter.rs @@ -24,7 +24,9 @@ use crate::{ noisy_simulator::register_noisy_simulator_submodule, qir_simulation::{ IdleNoiseParams, NoiseConfig, NoiseTable, QirInstruction, QirInstructionId, - cpu_simulators::{run_clifford, run_clifford_adaptive, run_cpu_adaptive, run_cpu_full_state}, + cpu_simulators::{ + run_clifford, run_clifford_adaptive, run_cpu_adaptive, run_cpu_full_state, + }, gpu_full_state::{ GpuContext, run_adaptive_parallel_shots, run_parallel_shots, try_create_gpu_adapter, }, From b2634d3ce1abb186aeec7c8f4c6fad773ac0480a Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 2026 11:01:51 -0700 Subject: [PATCH 
04/14] better python signatures --- source/pip/qsharp/_adaptive_pass.py | 41 +++++++------- .../pip/tests/test_adaptive_cpu_bytecode.py | 54 ++++++++++++++----- source/pip/tests/test_adaptive_cpu_noise.py | 12 +++-- .../tests/test_adaptive_cpu_quantum_ops.py | 10 ++-- 4 files changed, 76 insertions(+), 41 deletions(-) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 1aca506831..143dc50bde 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -20,8 +20,8 @@ class Bytecode(Enum): - Bit32 = 1 - Bit64 = 2 + Bit32 = 32 + Bit64 = 64 # --------------------------------------------------------------------------- @@ -199,8 +199,8 @@ class SwitchCase: @dataclass class IntOperand: - val: int = 0 - bits: int = 32 + val: int + bits: int def __post_init__(self): # Mask to the appropriate word-width so negative Python ints become @@ -289,6 +289,7 @@ def __init__(self, bytecode_kind: Bytecode): # Internal tracking. self._bytecode_kind = bytecode_kind + self._int_bits = bytecode_kind.value self._next_reg: int = 0 self._next_block: int = 0 self._next_qop: int = 0 @@ -297,7 +298,6 @@ def __init__(self, bytecode_kind: Bytecode): self._func_to_id: Dict[str, int] = {} # function name → function ID self._current_func_is_entry: bool = True self._noise_intrinsics: Optional[Dict[str, int]] = None - self._int_bits = 32 if bytecode_kind == Bytecode.Bit32 else 64 def run( self, @@ -457,10 +457,10 @@ def _resolve_operand(self, value: pyqir.Value) -> IntOperand | FloatOperand | Re # Try extracting as a qubit/result pointer constant. 
qid = pyqir.qubit_id(value) if qid is not None: - return IntOperand(qid) + return IntOperand(qid, self._int_bits) rid = pyqir.result_id(value) if rid is not None: - return IntOperand(rid) + return IntOperand(rid, self._int_bits) # Null pointer if value.is_null: reg = self._alloc_reg(value, REG_TYPE_PTR) @@ -717,7 +717,11 @@ def _emit_call(self, call: pyqir.Call) -> None: def _resolve_qubit_operands( self, args: List[pyqir.Value] ) -> Tuple[IntOperand | Reg, IntOperand | Reg, IntOperand | Reg]: - qs: List[IntOperand | Reg] = [IntOperand(), IntOperand(), IntOperand()] + qs: List[IntOperand | Reg] = [ + IntOperand(0, self._int_bits), + IntOperand(0, self._int_bits), + IntOperand(0, self._int_bits), + ] for i, arg in enumerate(args): qs[i] = self._resolve_qubit_operand(arg) return (qs[0], qs[1], qs[2]) @@ -813,8 +817,8 @@ def _emit_noise_intrinsic_call(self, call: pyqir.Call) -> None: self._emit( OP_QUANTUM_GATE, aux0=qop_idx, - aux1=IntOperand(qubit_count), - aux2=IntOperand(arg_offset), + aux1=IntOperand(qubit_count, self._int_bits), + aux2=IntOperand(arg_offset, self._int_bits), ) elif self._noise_intrinsics is not None: raise ValueError(f"Missing noise intrinsic: {callee_name}") @@ -877,19 +881,14 @@ def _emit_switch(self, switch_instr: pyqir.Switch) -> None: reference when ``mod.functions`` has already been iterated (two-pass compilation). ``operands`` is not affected by this behavior. """ - # operands layout: [cond, default_block, case_val0, case_block0, ...] 
- ops = switch_instr.operands - cond_reg = self._resolve_operand(ops[0]) - default_block = self._block_to_id[ops[1]] + cond_reg = self._resolve_operand(switch_instr.operands[0]) + default_block = self._block_to_id[switch_instr.default] case_offset = len(self.switch_cases) - num_case_pairs = (len(ops) - 2) // 2 - for i in range(num_case_pairs): - case_val = ops[2 + 2 * i] - case_block = ops[2 + 2 * i + 1] - target_block = self._block_to_id[case_block] + for case_val, block in switch_instr.cases: + target_block = self._block_to_id[block] switch_case = SwitchCase(case_val.value, target_block) self.switch_cases.append(switch_case) - case_count = num_case_pairs + case_count = len(switch_instr.cases) self._emit( OP_SWITCH, src0=cond_reg, @@ -914,7 +913,7 @@ def _emit_ret(self, instr: Any) -> None: self._emit(OP_RET, dst=ret_reg) else: # Void return — use immediate 0 as exit code. - self._emit(OP_RET, dst=IntOperand(0)) + self._emit(OP_RET, dst=IntOperand(0, self._int_bits)) # ------------------------------------------------------------------ # Comparison emitters diff --git a/source/pip/tests/test_adaptive_cpu_bytecode.py b/source/pip/tests/test_adaptive_cpu_bytecode.py index 6de5a36da3..29656d8b26 100644 --- a/source/pip/tests/test_adaptive_cpu_bytecode.py +++ b/source/pip/tests/test_adaptive_cpu_bytecode.py @@ -15,9 +15,10 @@ from collections import Counter import pytest +from qsharp._simulation import run_qir, Result import qsharp.openqasm +from typing import Literal -from qsharp._simulation import run_qir, Result # --------------------------------------------------------------------------- # Helpers @@ -26,7 +27,6 @@ # Deterministic programs need a single shot but we run multiple shots # to verify that multiple shots yield the same result. 
SHOTS = 100 - SIM_TYPES = ["cpu", "clifford"] @@ -43,7 +43,12 @@ def map_result_list_to_str(results): return results_str -def _run(qir: str, shots: int = SHOTS, seed: int = 42, sim_type: str = "cpu"): +def _run( + qir: str, + shots: int = SHOTS, + seed: int = 42, + sim_type: Literal["clifford", "cpu"] = "cpu", +): """Run *qir* on the given simulator and return shot results as a list of strings.""" results = run_qir(qir, shots, seed=seed, type=sim_type) return [map_result_list_to_str(r) for r in results] @@ -57,7 +62,7 @@ def check_result( num_qubits: int = 1, num_results: int = 1, record=None, - sim_type: str = "cpu", + sim_type: Literal["clifford", "cpu"] = "cpu", ): """Assert every shot produces *expected*.""" qir = format_qir( @@ -74,7 +79,9 @@ def check_result( }, f"Expected all {SHOTS} shots to be '{expected}', got {counts}" -def check_arith_result(qir_fragment: str, expected: str, sim_type: str = "cpu"): +def check_arith_result( + qir_fragment: str, expected: str, sim_type: Literal["clifford", "cpu"] = "cpu" +): body = build_arith_body(qir_fragment) check_result(body, expected, sim_type=sim_type) @@ -458,7 +465,9 @@ def test_read_result(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_record_output_ordering(sim_type): """Two results recorded: result0=1, result1=0 → '10'.""" - check_result(RECORD_OUTPUT_QIR, "10", num_qubits=2, num_results=2, sim_type=sim_type) + check_result( + RECORD_OUTPUT_QIR, "10", num_qubits=2, num_results=2, sim_type=sim_type + ) # ######################################################################### @@ -1004,7 +1013,9 @@ def test_const(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_and_i1_boolean(sim_type): """Deterministic boolean AND: both qubits |1⟩ → and i1 true, true → X → 1.""" - check_result(AND_I1_QIR, "1", num_qubits=2, num_results=3, record=[2], sim_type=sim_type) + check_result( + AND_I1_QIR, "1", num_qubits=2, num_results=3, record=[2], sim_type=sim_type + ) # 
========================================================================= @@ -1033,7 +1044,9 @@ def test_and_i1_boolean(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_or_i1_boolean(sim_type): """Deterministic boolean OR: q0=1, q1=0 → or i1 true, false → true → X → 1.""" - check_result(OR_I1_QIR, "1", num_qubits=2, num_results=3, record=[2], sim_type=sim_type) + check_result( + OR_I1_QIR, "1", num_qubits=2, num_results=3, record=[2], sim_type=sim_type + ) # ========================================================================= @@ -1061,7 +1074,9 @@ def test_or_i1_boolean(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_xor_i1_not(sim_type): """XOR i1 used as NOT: measure 0 → XOR true → true → X → 1.""" - check_result(XOR_NOT_QIR, "1", num_qubits=1, num_results=2, record=[1], sim_type=sim_type) + check_result( + XOR_NOT_QIR, "1", num_qubits=1, num_results=2, record=[1], sim_type=sim_type + ) # ######################################################################### @@ -1329,7 +1344,12 @@ def test_float_roundtrip(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_call_with_return_value(sim_type): """Call a function returning i64, use result in comparison.""" - check_result(CALL_WITH_RETVAL_QIR, "1", extra_decls=CALL_WITH_RETVAL_QIR_FN, sim_type=sim_type) + check_result( + CALL_WITH_RETVAL_QIR, + "1", + extra_decls=CALL_WITH_RETVAL_QIR_FN, + sim_type=sim_type, + ) # ========================================================================= @@ -1495,7 +1515,12 @@ def test_sext_i1_runtime(sim_type): @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_call_inttoptr_arg(sim_type): """Call a helper with an inttoptr constant expression argument.""" - check_result(CALL_INTTOPTR_ARG_QIR, "1", extra_decls=CALL_INTTOPTR_ARG_QIR_FN, sim_type=sim_type) + check_result( + CALL_INTTOPTR_ARG_QIR, + "1", + extra_decls=CALL_INTTOPTR_ARG_QIR_FN, + sim_type=sim_type, + ) # 
========================================================================= @@ -1522,7 +1547,12 @@ def test_sitofp_negative(sim_type): # ######################################################################### -def _run_openqasm(qasm_src: str, shots: int = SHOTS, seed: int = 42, sim_type: str = "cpu"): +def _run_openqasm( + qasm_src: str, + shots: int = SHOTS, + seed: int = 42, + sim_type: Literal["clifford", "cpu"] = "cpu", +): """Compile OpenQASM source via the adaptive pass and run on the given simulator.""" qir = qsharp.openqasm.compile( qasm_src, diff --git a/source/pip/tests/test_adaptive_cpu_noise.py b/source/pip/tests/test_adaptive_cpu_noise.py index 4aec68ccf2..df829528e2 100644 --- a/source/pip/tests/test_adaptive_cpu_noise.py +++ b/source/pip/tests/test_adaptive_cpu_noise.py @@ -13,16 +13,16 @@ from collections import Counter from typing import Optional, List import pytest +from qsharp._simulation import run_qir, NoiseConfig, Result import qsharp.openqasm +from typing import Literal -from qsharp._simulation import run_qir, NoiseConfig, Result # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- SHOTS = 100 - SIM_TYPES = ["cpu", "clifford"] @@ -48,7 +48,7 @@ def get_histogram( noise: Optional[NoiseConfig] = None, record: Optional[List[int]] = None, shots=SHOTS, - sim_type: str = "cpu", + sim_type: Literal["clifford", "cpu"] = "cpu", ): qir = format_qir( qir_fragment, @@ -72,7 +72,7 @@ def check_result( num_results: int = 1, noise: Optional[NoiseConfig] = None, record: Optional[List[int]] = None, - sim_type: str = "cpu", + sim_type: Literal["clifford", "cpu"] = "cpu", ): """Assert every shot produces *expected*.""" counts = get_histogram( @@ -210,7 +210,9 @@ def test_z_noise_on_h_i_h_yields_1(sim_type): def test_probabilistic_x_noise(sim_type): noise = NoiseConfig() noise.cx.ix = 0.5 - counts = get_histogram(I_QIR, shots=1000, num_qubits=2, 
noise=noise, sim_type=sim_type) + counts = get_histogram( + I_QIR, shots=1000, num_qubits=2, noise=noise, sim_type=sim_type + ) assert counts["0"] > 400, f"Expected ~500 '0' results, got {counts['0']}" assert counts["1"] > 400, f"Expected ~500 '1' results, got {counts['1']}" diff --git a/source/pip/tests/test_adaptive_cpu_quantum_ops.py b/source/pip/tests/test_adaptive_cpu_quantum_ops.py index db34a07c5d..1296b510d2 100644 --- a/source/pip/tests/test_adaptive_cpu_quantum_ops.py +++ b/source/pip/tests/test_adaptive_cpu_quantum_ops.py @@ -13,10 +13,9 @@ """ from collections import Counter - import pytest - from qsharp._simulation import run_qir, Result +from typing import Literal SIM_TYPES = ["cpu", "clifford"] @@ -39,7 +38,12 @@ def map_result_list_to_str(results): return results_str -def _run(qir: str, shots: int, seed: int = 42, sim_type: str = "cpu"): +def _run( + qir: str, + shots: int, + seed: int = 42, + sim_type: Literal["clifford", "cpu"] = "cpu", +): """Run *qir* on the given simulator and return shot results as a list of strings.""" results = run_qir(qir, shots, seed=seed, type=sim_type) return [map_result_list_to_str(r) for r in results] From 37bb014bb7a18e2910bd8ebb492bcb14395803eb Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 2026 11:03:54 -0700 Subject: [PATCH 05/14] add back operands laoyout comment in _emit_switch --- source/pip/qsharp/_adaptive_pass.py | 1 + 1 file changed, 1 insertion(+) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 143dc50bde..214764d8c9 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -881,6 +881,7 @@ def _emit_switch(self, switch_instr: pyqir.Switch) -> None: reference when ``mod.functions`` has already been iterated (two-pass compilation). ``operands`` is not affected by this behavior. """ + # operands layout: [cond, default_block, case_val0, case_block0, ...] 
cond_reg = self._resolve_operand(switch_instr.operands[0]) default_block = self._block_to_id[switch_instr.default] case_offset = len(self.switch_cases) From c5cc017e7dc04362b3e913f452390d08ef3ed956 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 2026 11:10:10 -0700 Subject: [PATCH 06/14] remove `s`, `s_adj`, and `z` noise overriding --- .../pip/src/qir_simulation/cpu_simulators.rs | 28 ++----------------- 1 file changed, 2 insertions(+), 26 deletions(-) diff --git a/source/pip/src/qir_simulation/cpu_simulators.rs b/source/pip/src/qir_simulation/cpu_simulators.rs index dbcc663ebf..de5662bd97 100644 --- a/source/pip/src/qir_simulation/cpu_simulators.rs +++ b/source/pip/src/qir_simulation/cpu_simulators.rs @@ -155,7 +155,7 @@ fn run( num_results: u32, shots: u32, seed: Option, - mut noise: noise_config::NoiseConfig, + noise: noise_config::NoiseConfig, make_simulator: SimulatorBuilder, ) -> Vec where @@ -164,18 +164,6 @@ where Noise: From> + Send + Sync, S: Simulator, { - if !noise.rz.is_noiseless() { - if noise.s.is_noiseless() { - noise.s = noise.rz.clone(); - } - if noise.z.is_noiseless() { - noise.z = noise.rz.clone(); - } - if noise.s_adj.is_noiseless() { - noise.s_adj = noise.rz.clone(); - } - } - let noise: Noise = noise.into(); let noise = Arc::new(noise); @@ -366,7 +354,7 @@ fn run_adaptive( program: &bytecode::AdaptiveProgram, shots: u32, seed: Option, - mut noise: noise_config::NoiseConfig, + noise: noise_config::NoiseConfig, make_simulator: SimulatorBuilder, ) -> Vec where @@ -374,18 +362,6 @@ where Noise: From> + Send + Sync, S: Simulator, { - if !noise.rz.is_noiseless() { - if noise.s.is_noiseless() { - noise.s = noise.rz.clone(); - } - if noise.z.is_noiseless() { - noise.z = noise.rz.clone(); - } - if noise.s_adj.is_noiseless() { - noise.s_adj = noise.rz.clone(); - } - } - let noise: Noise = noise.into(); let noise = Arc::new(noise); From 20007aa4673d909b38d2544d2e83bb6f3b22ab3f Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 
2026 11:25:00 -0700 Subject: [PATCH 07/14] make `VOID_RETURN` sentil value depend on bytecode word width --- source/pip/qsharp/_adaptive_bytecode.py | 3 --- source/pip/qsharp/_adaptive_pass.py | 9 ++++++++- source/simulators/src/bytecode/runtime.rs | 2 +- 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/source/pip/qsharp/_adaptive_bytecode.py b/source/pip/qsharp/_adaptive_bytecode.py index 876a0a196f..b86b8e2cda 100644 --- a/source/pip/qsharp/_adaptive_bytecode.py +++ b/source/pip/qsharp/_adaptive_bytecode.py @@ -127,6 +127,3 @@ REG_TYPE_F32 = 3 REG_TYPE_F64 = 4 REG_TYPE_PTR = 5 - -# ── Sentinel values ────────────────────────────────────────────────────────── -VOID_RETURN = 0xFFFFFFFF # Function does not have a return value. diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 214764d8c9..504c3f5808 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -272,6 +272,13 @@ def encode_float_as_bits(val: float, bytecode_kind: Bytecode) -> int: return struct.unpack(" None: self.call_args.append(reg.val) # Allocate return register if function has non-void return type if call.type.is_void: - return_reg = VOID_RETURN # no return + return_reg = void_return(self._bytecode_kind) # no return else: return_reg = self._alloc_reg(call, REG_TYPE_I32) self._emit( diff --git a/source/simulators/src/bytecode/runtime.rs b/source/simulators/src/bytecode/runtime.rs index db0d0e6894..ef66368d36 100644 --- a/source/simulators/src/bytecode/runtime.rs +++ b/source/simulators/src/bytecode/runtime.rs @@ -144,7 +144,7 @@ const OPID_CY: u64 = 29; const OPID_CORRELATED_NOISE: u64 = 131; // Sentinel -const VOID_RETURN: u64 = 0xFFFF_FFFF; +const VOID_RETURN: u64 = 0xFFFF_FFFF_FFFF_FFFF; // --------------------------------------------------------------------------- // Runtime state From 11eac429e0ef757bd61fb0e769548a2fb7d398b2 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 2026 11:52:43 -0700 
Subject: [PATCH 08/14] cleanup runtime.rs --- .../pip/tests/test_adaptive_cpu_bytecode.py | 2 +- .../pip/tests/test_adaptive_gpu_bytecode.py | 2 +- source/simulators/src/bytecode/runtime.rs | 120 +++++++++--------- 3 files changed, 60 insertions(+), 64 deletions(-) diff --git a/source/pip/tests/test_adaptive_cpu_bytecode.py b/source/pip/tests/test_adaptive_cpu_bytecode.py index 29656d8b26..9118b27291 100644 --- a/source/pip/tests/test_adaptive_cpu_bytecode.py +++ b/source/pip/tests/test_adaptive_cpu_bytecode.py @@ -800,7 +800,7 @@ def test_zext(sim_type): SEXT_QIR = """ ; sext i1 true to i64 → -1 (all ones), check -1 < 0 → true %s = sext i1 true to i64 - %flag = icmp slt i64 %s, 0 + %flag = icmp eq i64 %s, -1 """ diff --git a/source/pip/tests/test_adaptive_gpu_bytecode.py b/source/pip/tests/test_adaptive_gpu_bytecode.py index 8cd2dcb93a..e7e1335e66 100644 --- a/source/pip/tests/test_adaptive_gpu_bytecode.py +++ b/source/pip/tests/test_adaptive_gpu_bytecode.py @@ -796,7 +796,7 @@ def test_zext(): SEXT_QIR = """ ; sext i1 true to i64 → -1 (all ones), check -1 < 0 → true %s = sext i1 true to i64 - %flag = icmp slt i64 %s, 0 + %flag = icmp eq i64 %s, -1 """ diff --git a/source/simulators/src/bytecode/runtime.rs b/source/simulators/src/bytecode/runtime.rs index ef66368d36..02a9ff420c 100644 --- a/source/simulators/src/bytecode/runtime.rs +++ b/source/simulators/src/bytecode/runtime.rs @@ -11,11 +11,7 @@ clippy::cast_possible_truncation, clippy::cast_possible_wrap, clippy::cast_precision_loss, - clippy::cast_sign_loss, - clippy::float_cmp, - clippy::match_same_arms, - clippy::single_match_else, - clippy::too_many_lines + clippy::cast_sign_loss )] use crate::{ @@ -230,44 +226,41 @@ fn dispatch_quantum_gate( let op = &program.quantum_ops[op_idx]; let op_id = op.op_id; - match op_id { - OPID_CORRELATED_NOISE => { - let qubit_count = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; - let arg_offset = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; - let table_id = 
op.q1 as u32; - let targets: Vec = (0..qubit_count) - .map(|i| rt.read_reg(program.call_args[arg_offset + i]) as usize) - .collect(); - sim.correlated_noise_intrinsic(table_id, &targets); - } - _ => { - let q1 = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; - let q2 = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; - let angle = op.angle; - match op_id { - OPID_X => sim.x(q1), - OPID_Y => sim.y(q1), - OPID_Z => sim.z(q1), - OPID_H => sim.h(q1), - OPID_S => sim.s(q1), - OPID_S_ADJ => sim.s_adj(q1), - OPID_T => sim.t(q1), - OPID_T_ADJ => sim.t_adj(q1), - OPID_SX => sim.sx(q1), - OPID_SX_ADJ => sim.sx_adj(q1), - OPID_RX => sim.rx(angle, q1), - OPID_RY => sim.ry(angle, q1), - OPID_RZ => sim.rz(angle, q1), - OPID_CX => sim.cx(q1, q2), - OPID_CY => sim.cy(q1, q2), - OPID_CZ => sim.cz(q1, q2), - OPID_RXX => sim.rxx(angle, q1, q2), - OPID_RYY => sim.ryy(angle, q1, q2), - OPID_RZZ => sim.rzz(angle, q1, q2), - OPID_SWAP => sim.swap(q1, q2), - OPID_MOVE => sim.mov(q1), - _ => panic!("unsupported quantum gate op_id={op_id}"), - } + if op_id == OPID_CORRELATED_NOISE { + let qubit_count = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + let arg_offset = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; + let table_id = op.q1 as u32; + let targets: Vec = (0..qubit_count) + .map(|i| rt.read_reg(program.call_args[arg_offset + i]) as usize) + .collect(); + sim.correlated_noise_intrinsic(table_id, &targets); + } else { + let q1 = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; + let q2 = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; + let angle = op.angle; + match op_id { + OPID_X => sim.x(q1), + OPID_Y => sim.y(q1), + OPID_Z => sim.z(q1), + OPID_H => sim.h(q1), + OPID_S => sim.s(q1), + OPID_S_ADJ => sim.s_adj(q1), + OPID_T => sim.t(q1), + OPID_T_ADJ => sim.t_adj(q1), + OPID_SX => sim.sx(q1), + OPID_SX_ADJ => sim.sx_adj(q1), + OPID_RX => sim.rx(angle, q1), + OPID_RY => sim.ry(angle, q1), + OPID_RZ => sim.rz(angle, q1), + OPID_CX => sim.cx(q1, 
q2), + OPID_CY => sim.cy(q1, q2), + OPID_CZ => sim.cz(q1, q2), + OPID_RXX => sim.rxx(angle, q1, q2), + OPID_RYY => sim.ryy(angle, q1, q2), + OPID_RZZ => sim.rzz(angle, q1, q2), + OPID_SWAP => sim.swap(q1, q2), + OPID_MOVE => sim.mov(q1), + _ => panic!("unsupported quantum gate op_id={op_id}"), } } } @@ -310,6 +303,7 @@ fn dispatch_reset( // Main interpreter entry point // --------------------------------------------------------------------------- +#[allow(clippy::too_many_lines)] pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { const MAX_STEPS: u64 = 10_000_000; @@ -553,6 +547,7 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { OP_FCMP => { let a = rt.resolve_f64(instr.src0, flags, 0); let b = rt.resolve_f64(instr.src1, flags, 1); + #[allow(clippy::float_cmp)] let result = match subcode { FCMP_OEQ => a == b, FCMP_ONE => a != b, @@ -596,13 +591,25 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { } // ----- Type conversion ----- - OP_ZEXT => { + OP_ZEXT | OP_TRUNC | OP_INTTOPTR => { let val = rt.resolve_u64(instr.src0, flags, 0); rt.write_reg(instr.dst, val); rt.pc += 1; } OP_SEXT => { + // Sign-extend a narrower integer (src_bits wide) to a full i64. + // + // Uses the shift-left-then-arithmetic-shift-right trick: + // 1. Shift left by (64 - src_bits) to move the narrow sign bit + // into bit 63 (the i64 sign position). + // 2. Arithmetic shift right by the same amount to replicate the + // sign bit across all upper bits. 
+ // + // Example: sext i1 true (value 1, src_bits=1) + // shift = 63 + // 1 << 63 = 0x8000..0 (sign bit set) + // >> 63 = 0xFFFF..F (-1 as i64) let val = rt.resolve_i64(instr.src0, flags, 0); let src_bits = instr.aux0 as u32; let result = if src_bits > 0 && src_bits < 64 { @@ -615,24 +622,12 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { rt.pc += 1; } - OP_TRUNC => { - let val = rt.resolve_u64(instr.src0, flags, 0); - rt.write_reg(instr.dst, val); - rt.pc += 1; - } - OP_FPEXT | OP_FPTRUNC => { let val = rt.resolve_f64(instr.src0, flags, 0); rt.write_f64(instr.dst, val); rt.pc += 1; } - OP_INTTOPTR => { - let val = rt.resolve_u64(instr.src0, flags, 0); - rt.write_reg(instr.dst, val); - rt.pc += 1; - } - OP_FPTOSI => { let val = rt.resolve_f64(instr.src0, flags, 0); rt.write_reg(instr.dst, val as i64 as u64); @@ -661,6 +656,13 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { } // ----- Data movement ----- + #[allow(clippy::match_same_arms)] + OP_MOV => { + let val = rt.resolve_u64(instr.src0, flags, 0); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + OP_SELECT => { let cond = rt.resolve_u64(instr.src0, flags, 0) != 0; let true_val = rt.resolve_u64(instr.aux0, flags, 3); @@ -669,12 +671,6 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { rt.pc += 1; } - OP_MOV => { - let val = rt.resolve_u64(instr.src0, flags, 0); - rt.write_reg(instr.dst, val); - rt.pc += 1; - } - OP_CONST => { rt.write_reg(instr.dst, instr.src0); rt.pc += 1; From 78134d3ef65511d92bf4a795dcf70f6a06508044 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Thu, 2 Apr 2026 15:39:38 -0700 Subject: [PATCH 09/14] move `z`, `s`, `s_adj` inherit-noise-from-rz logic to device specific code --- source/pip/qsharp/_device/_atom/__init__.py | 19 ++++++++++ source/pip/qsharp/_native.pyi | 5 +++ source/pip/src/qir_simulation.rs | 10 ++++-- .../devices/test_atom_e2e.py | 35 +++++++++++++++++++ source/pip/tests/test_clifford_simulator.py | 32 ----------------- 
source/pip/tests/test_noisy_config.py | 9 +++-- 6 files changed, 73 insertions(+), 37 deletions(-) diff --git a/source/pip/qsharp/_device/_atom/__init__.py b/source/pip/qsharp/_device/_atom/__init__.py index 6a8b0ae80c..2fb6d16bd4 100644 --- a/source/pip/qsharp/_device/_atom/__init__.py +++ b/source/pip/qsharp/_device/_atom/__init__.py @@ -258,6 +258,25 @@ def simulate( if noise is None: noise = NoiseConfig() + # Override s, s_adj, and z noise if they are unset + # and rz noise is set. + if noise and not noise.rz.is_noiseless(): + if noise.s.is_noiseless(): + noise.s.x = noise.rz.x + noise.s.y = noise.rz.y + noise.s.z = noise.rz.z + noise.s.loss = noise.rz.loss + if noise.s_adj.is_noiseless(): + noise.s_adj.x = noise.rz.x + noise.s_adj.y = noise.rz.y + noise.s_adj.z = noise.rz.z + noise.s_adj.loss = noise.rz.loss + if noise.z.is_noiseless(): + noise.z.x = noise.rz.x + noise.z.y = noise.rz.y + noise.z.z = noise.rz.z + noise.z.loss = noise.rz.loss + compiled = self.compile(qir) module = Module.from_ir(Context(), str(compiled)) ValidateNoConditionalBranches().run(module) diff --git a/source/pip/qsharp/_native.pyi b/source/pip/qsharp/_native.pyi index 2d059e83ad..0dc20ae804 100644 --- a/source/pip/qsharp/_native.pyi +++ b/source/pip/qsharp/_native.pyi @@ -896,6 +896,11 @@ class NoiseTable: The phase flip noise to use in simulation. """ + def is_noiseless(self) -> bool: + """ + Returns `true` if there is no noise set. 
+ """ + class NoiseIntrinsicsTable: def __contains__(self, name: str) -> bool: """ diff --git a/source/pip/src/qir_simulation.rs b/source/pip/src/qir_simulation.rs index 27f360c92c..28be031f63 100644 --- a/source/pip/src/qir_simulation.rs +++ b/source/pip/src/qir_simulation.rs @@ -418,9 +418,9 @@ impl NoiseTable { if let Some(p) = self.pauli_noise.get(&key) { return Ok(*p); } - Err(PyAttributeError::new_err(format!( - "'NoiseTable' object has no attribute '{pauli}'", - ))) + // If pauli string is valid but is not in the noise table + // it means it has not been set. Just return 0 in this case. + Ok(0.0) } /// Set the probability of noise for an element on the [`NoiseTable`] @@ -610,6 +610,10 @@ or one argument of type 'list[tuple[str, float]]', but found {py_args:?}" pub fn set_phaseflip(&mut self, value: Probability) -> PyResult<()> { self.set_pauli_noise_elt("Z", value) } + + pub fn is_noiseless(&self) -> PyResult { + Ok(self.pauli_noise.is_empty() && self.loss == 0.0) + } } impl From for qdk_simulators::noise_config::NoiseTable { diff --git a/source/pip/tests-integration/devices/test_atom_e2e.py b/source/pip/tests-integration/devices/test_atom_e2e.py index 753f93d784..258f17873a 100644 --- a/source/pip/tests-integration/devices/test_atom_e2e.py +++ b/source/pip/tests-integration/devices/test_atom_e2e.py @@ -158,3 +158,38 @@ def test_device_simulate_with_loss() -> None: assert result == [[qsharp.Result.Loss, qsharp.Result.Loss]] assert result2 == [[qsharp.Result.Loss, qsharp.Result.Loss]] + + +def test_s_noise_inherits_from_rz(): + qsharp.init(target_profile=qsharp.TargetProfile.Base) + qsharp.eval("operation Main() : Result { use q = Qubit(); S(q); MResetZ(q) }") + ir = qsharp.compile("Main()") + noise = NoiseConfig() + noise.rz.x = 1.0 + device = NeutralAtomDevice() + output = device.simulate(ir, 1, noise) + assert output == [qsharp.Result.One] + + +def test_z_noise_inherits_from_rz(): + qsharp.init(target_profile=qsharp.TargetProfile.Base) + 
qsharp.eval("operation Main() : Result { use q = Qubit(); Z(q); MResetZ(q) }") + ir = qsharp.compile("Main()") + noise = NoiseConfig() + noise.rz.x = 1.0 + device = NeutralAtomDevice() + output = device.simulate(ir, 1, noise) + assert output == [qsharp.Result.One] + + +def test_s_adj_noise_inherits_from_rz(): + qsharp.init(target_profile=qsharp.TargetProfile.Base) + qsharp.eval( + "operation Main() : Result { use q = Qubit(); Adjoint S(q); MResetZ(q) }" + ) + ir = qsharp.compile("Main()") + noise = NoiseConfig() + noise.rz.x = 1.0 + device = NeutralAtomDevice() + output = device.simulate(ir, 1, noise) + assert output == [qsharp.Result.One] diff --git a/source/pip/tests/test_clifford_simulator.py b/source/pip/tests/test_clifford_simulator.py index 76acfd934e..40a1ddaffa 100644 --- a/source/pip/tests/test_clifford_simulator.py +++ b/source/pip/tests/test_clifford_simulator.py @@ -69,38 +69,6 @@ def test_million(): print(output) -def test_s_noise_inherits_from_rz(): - qsharp.init(target_profile=TargetProfile.Base) - qsharp.eval("operation Main() : Result { use q = Qubit(); S(q); MResetZ(q) }") - ir = qsharp.compile("Main()") - noise = NoiseConfig() - noise.rz.x = 1.0 - output = run_qir_clifford(str(ir), 1, noise) - assert output == [Result.One] - - -def test_z_noise_inherits_from_rz(): - qsharp.init(target_profile=TargetProfile.Base) - qsharp.eval("operation Main() : Result { use q = Qubit(); Z(q); MResetZ(q) }") - ir = qsharp.compile("Main()") - noise = NoiseConfig() - noise.rz.x = 1.0 - output = run_qir_clifford(str(ir), 1, noise) - assert output == [Result.One] - - -def test_s_adj_noise_inherits_from_rz(): - qsharp.init(target_profile=TargetProfile.Base) - qsharp.eval( - "operation Main() : Result { use q = Qubit(); Adjoint S(q); MResetZ(q) }" - ) - ir = qsharp.compile("Main()") - noise = NoiseConfig() - noise.rz.x = 1.0 - output = run_qir_clifford(str(ir), 1, noise) - assert output == [Result.One] - - def test_program_with_branching_succeeds(): 
qsharp.init(target_profile=TargetProfile.Adaptive_RI) qsharp.eval( diff --git a/source/pip/tests/test_noisy_config.py b/source/pip/tests/test_noisy_config.py index 7bb46eadea..1042b21f31 100644 --- a/source/pip/tests/test_noisy_config.py +++ b/source/pip/tests/test_noisy_config.py @@ -5,6 +5,11 @@ import pytest +def test_accessing_unset_valid_pauli(): + noise = NoiseConfig() + assert noise.h.x == 0 + + def test_setting_1q_noise(): noise = NoiseConfig() noise.h.set_pauli_noise("X", 0.01) @@ -96,10 +101,10 @@ def test_setting_non_valid_pauli_through_attr_errors(): noise.h.w = 0.01 -def test_accessing_non_set_pauli_attr_errors(): +def test_accessing_invalid_pauli_attr_errors(): noise = NoiseConfig() with pytest.raises(AttributeError): - noise.h.x + noise.h.w def test_accessing_non_valid_pauli_attr_errors(): From 4c32c7275f64ca0f53bdb5fa2199e980e04ecffc Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Tue, 7 Apr 2026 13:05:33 -0700 Subject: [PATCH 10/14] fix dynamic angles --- source/pip/qsharp/_adaptive_pass.py | 5 +- source/pip/qsharp/_native.pyi | 2 +- source/pip/qsharp/_simulation.py | 67 +++---- source/pip/src/qir_simulation.rs | 2 +- .../pip/src/qir_simulation/gpu_full_state.rs | 2 +- .../tests/test_adaptive_cpu_quantum_ops.py | 60 ++++++ .../tests/test_adaptive_gpu_quantum_ops.py | 174 ++++++++++-------- source/simulators/src/bytecode.rs | 4 +- source/simulators/src/bytecode/runtime.rs | 2 +- .../src/gpu_full_state_simulator/common.wgsl | 2 + .../gpu_full_state_simulator/shader_types.rs | 2 +- .../simulator_adaptive.wgsl | 72 ++++++++ 12 files changed, 265 insertions(+), 129 deletions(-) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 504c3f5808..eac4e14f33 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -167,7 +167,7 @@ class QuantumOp: q1: int q2: int q3: int - angle: float + angle: int @dataclass @@ -423,7 +423,7 @@ def _emit_quantum_op( q1: int = 0, q2: int = 0, q3: 
int = 0, - angle: float = 0.0, + angle: int = 0, ) -> int: idx = self._next_qop self._next_qop += 1 @@ -784,6 +784,7 @@ def _emit_quantum_call(self, call: pyqir.Call) -> None: qop_idx = self._emit_quantum_op(op_id, q1.val, q2.val, q3.val, angle.val) self._emit( OP_QUANTUM_GATE, + src0=angle, aux0=qop_idx, aux1=q1, aux2=q2, diff --git a/source/pip/qsharp/_native.pyi b/source/pip/qsharp/_native.pyi index 0dc20ae804..e52158aabd 100644 --- a/source/pip/qsharp/_native.pyi +++ b/source/pip/qsharp/_native.pyi @@ -1064,9 +1064,9 @@ def try_create_gpu_adapter() -> str: def run_parallel_shots( input: List[QirInstruction], - shots: int, qubit_count: int, result_count: int, + shots: int, noise: Optional[NoiseConfig], seed: Optional[int], ) -> List[str]: diff --git a/source/pip/qsharp/_simulation.py b/source/pip/qsharp/_simulation.py index 7a3aae40af..b4df87f577 100644 --- a/source/pip/qsharp/_simulation.py +++ b/source/pip/qsharp/_simulation.py @@ -504,6 +504,30 @@ def str_to_result(result: str): raise ValueError(f"Invalid result {result}") +def run_base( + rust_run_base_fn: Callable, + mod: pyqir.Module, + shots: int, + noise: Optional[NoiseConfig], + seed: int, +): + """ + Runs a base profile program given a rust simulator. Adds output recording logic. + """ + if noise is None: + (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) + else: + (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) + recorder = OutputRecordingPass() + recorder.run(mod) + return list( + map( + recorder.process_output, + rust_run_base_fn(gates, num_qubits, num_results, shots, noise, seed), + ) + ) + + def run_adaptive( rust_run_adaptive_fn: Callable, program: AdaptiveProgram, @@ -512,7 +536,7 @@ def run_adaptive( seed: int, ): """ - Runs an adaptive program given a rust simulator. Adds output recording logic. + Runs an adaptive profile program given a rust simulator. Adds output recording logic. 
""" results = rust_run_adaptive_fn(program.as_dict(), shots, noise, seed) # Extract recorded output result indices from the bytecode. @@ -540,19 +564,7 @@ def run_qir_clifford( program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) return run_adaptive(run_clifford_adaptive, program, shots, noise, seed) else: - if noise is None: - (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) - else: - (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) - recorder = OutputRecordingPass() - recorder.run(mod) - - return list( - map( - recorder.process_output, - run_clifford(gates, num_qubits, num_results, shots, noise, seed), - ) - ) + return run_base(run_clifford, mod, shots, noise, seed) def run_qir_cpu( @@ -566,19 +578,7 @@ def run_qir_cpu( program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) return run_adaptive(run_cpu_adaptive, program, shots, noise, seed) else: - if noise is None: - (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) - else: - (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) - recorder = OutputRecordingPass() - recorder.run(mod) - - return list( - map( - recorder.process_output, - run_cpu_full_state(gates, num_qubits, num_results, shots, noise, seed), - ) - ) + return run_base(run_cpu_full_state, mod, shots, noise, seed) def run_qir_gpu( @@ -594,18 +594,7 @@ def run_qir_gpu( program = AdaptiveProfilePass(Bytecode.Bit32).run(mod, noise) return run_adaptive(run_adaptive_parallel_shots, program, shots, noise, seed) else: - if noise is None: - (gates, num_qubits, num_results) = AggregateGatesPass().run(mod) - else: - (gates, num_qubits, num_results) = CorrelatedNoisePass(noise).run(mod) - recorder = OutputRecordingPass() - recorder.run(mod) - return list( - map( - recorder.process_output, - run_parallel_shots(gates, shots, num_qubits, num_results, noise, seed), - ) - ) + return run_base(run_parallel_shots, mod, shots, noise, seed) def prepare_qir_with_correlated_noise( diff 
--git a/source/pip/src/qir_simulation.rs b/source/pip/src/qir_simulation.rs index 28be031f63..555d97798d 100644 --- a/source/pip/src/qir_simulation.rs +++ b/source/pip/src/qir_simulation.rs @@ -776,7 +776,7 @@ where type BlockTuple = (W, W, W); type InsTuple = (W, W, W, W, W, W, W, W); - type OpTuple = (W, W, W, W, f64); + type OpTuple = (W, W, W, W, W); type FunTuple = (W, W, W); type PhiTuple = (W, W); type SwitchTuple = (W, W); diff --git a/source/pip/src/qir_simulation/gpu_full_state.rs b/source/pip/src/qir_simulation/gpu_full_state.rs index fc30a822ca..e0aa4ac7be 100644 --- a/source/pip/src/qir_simulation/gpu_full_state.rs +++ b/source/pip/src/qir_simulation/gpu_full_state.rs @@ -38,9 +38,9 @@ pub fn try_create_gpu_adapter() -> PyResult { pub fn run_parallel_shots<'py>( py: Python<'py>, input: &Bound<'py, PyList>, - shots: i32, qubit_count: i32, result_count: i32, + shots: i32, noise_config: Option<&Bound<'py, NoiseConfig>>, seed: Option, ) -> PyResult> { diff --git a/source/pip/tests/test_adaptive_cpu_quantum_ops.py b/source/pip/tests/test_adaptive_cpu_quantum_ops.py index 1296b510d2..dcc4f7323e 100644 --- a/source/pip/tests/test_adaptive_cpu_quantum_ops.py +++ b/source/pip/tests/test_adaptive_cpu_quantum_ops.py @@ -375,3 +375,63 @@ def test_teleport_chain_histogram(sim_type): assert count_00 > 4000, f"Expected ~5000 '00' results, got {count_00}" assert count_11 > 4000, f"Expected ~5000 '11' results, got {count_11}" assert count_00 + count_11 == 10000, "All shots should produce a result" + + +DYNAMIC_ROTATION_ANGLE_QIR = r""" +%Result = type opaque +%Qubit = type opaque + +@0 = internal constant [4 x i8] c"0_r\00" + +define i64 @ENTRYPOINT__main() #0 { +block_0: + call void @__quantum__rt__initialize(i8* null) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %var_1 = call i1 @__quantum__rt__read_result(%Result* 
inttoptr (i64 0 to %Result*)) + %var_2 = icmp eq i1 %var_1, false + br i1 %var_2, label %block_1, label %block_2 +block_1: + br label %block_3 +block_2: + br label %block_3 +block_3: + %var_3 = phi double [0.5, %block_1], [1.0, %block_2] + call void @__quantum__qis__rx__body(double %var_3, %Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @0, i64 0, i64 0)) + ret i64 0 +} + +declare void @__quantum__rt__initialize(i8*) +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) #1 +declare i1 @__quantum__rt__read_result(%Result*) +declare void @__quantum__qis__rx__body(double, %Qubit*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "output_labeling_schema" "qir_profiles"="adaptive_profile" "required_num_qubits"="2" "required_num_results"="2" } +attributes #1 = { "irreversible" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} + +!0 = !{i32 1, !"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 false} +!4 = !{i32 5, !"int_computations", !{!"i64"}} +!5 = !{i32 5, !"float_computations", !{!"double"}} +""" + + +def test_dynamic_rotation_angle(): + results = _run(DYNAMIC_ROTATION_ANGLE_QIR, shots=10_000, seed=42, sim_type="cpu") + assert len(results) == 10_000 + + counts = Counter(results) + count_0 = counts.get("0", 0) + count_1 = counts.get("1", 0) + + assert count_1 > 1400, f"Expected ~15% '1' results, got {count_1}" + assert count_0 > 8400, f"Expected ~85% '0' results, got {count_0}" + assert count_0 + count_1 == 10_000, "All shots should produce a result" diff --git 
a/source/pip/tests/test_adaptive_gpu_quantum_ops.py b/source/pip/tests/test_adaptive_gpu_quantum_ops.py index 13b0bc0a1e..5befd7a4c8 100644 --- a/source/pip/tests/test_adaptive_gpu_quantum_ops.py +++ b/source/pip/tests/test_adaptive_gpu_quantum_ops.py @@ -37,6 +37,18 @@ from qsharp._simulation import GpuSimulator +# Acquiring the GPU resources takes time, so we acquire them once and use them +# for all the tests. This is fine since pytest runs tests sequentially. +sim = GpuSimulator() + + +def run_shots(qir: str, shots: int = 10_000, seed: int = 42): + """Run *qir* on the GPU and return the shot_results list.""" + global sim + sim.set_program(qir) + return sim.run_shots(shots, seed=seed) + + # --------------------------------------------------------------------------- # QIR source # --------------------------------------------------------------------------- @@ -124,12 +136,12 @@ def test_measure_and_correct_histogram(): Run 10000 shots and verify ~50/50 split of "0" and "1" outcomes. The measurement result records whether H collapsed to |1⟩ (then X corrects). 
""" - sim = GpuSimulator() - sim.set_program(MEASURE_AND_CORRECT_QIR) - results = sim.run_shots(10000, seed=42) - + results = run_shots(MEASURE_AND_CORRECT_QIR) shot_results = results["shot_results"] assert len(shot_results) == 10000 + assert all( + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" counts = Counter(shot_results) # Each shot produces a single-bit result string: "0" or "1" @@ -142,19 +154,6 @@ def test_measure_and_correct_histogram(): assert count_0 + count_1 == 10000, "All shots should produce a result" -@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) -def test_measure_and_correct_no_errors(): - """Example 1: All shots should complete without GPU errors.""" - sim = GpuSimulator() - sim.set_program(MEASURE_AND_CORRECT_QIR) - results = sim.run_shots(1000, seed=123) - - shot_result_codes = results["shot_result_codes"] - assert all( - code == 0 for code in shot_result_codes - ), f"Some shots had non-zero error codes: {[c for c in shot_result_codes if c != 0]}" - - @pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) def test_conditional_loop_all_results_are_one(): """Example 3: The loop exits only when measurement yields 1. @@ -163,12 +162,12 @@ def test_conditional_loop_all_results_are_one(): until that outcome. 
""" shots = 5000 - sim = GpuSimulator() - sim.set_program(CONDITIONAL_LOOP_QIR) - results = sim.run_shots(shots, seed=99) - + results = run_shots(CONDITIONAL_LOOP_QIR, shots=shots) shot_results = results["shot_results"] assert len(shot_results) == shots + assert all( + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" counts = Counter(shot_results) # Every shot should exit with result "1" @@ -177,19 +176,6 @@ def test_conditional_loop_all_results_are_one(): ), f"Expected all {shots} shots to produce '1', got counts: {counts}" -@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) -def test_conditional_loop_no_errors(): - """Example 3: All shots should complete without GPU errors.""" - sim = GpuSimulator() - sim.set_program(CONDITIONAL_LOOP_QIR) - results = sim.run_shots(1000, seed=456) - - shot_result_codes = results["shot_result_codes"] - assert all( - code == 0 for code in shot_result_codes - ), f"Some shots had non-zero error codes: {[c for c in shot_result_codes if c != 0]}" - - # Example 2: Loop with phi node — GHZ state preparation # Applies H to qubit 0, then loops from i=1 to 4, # applying CNOT(q0, q_i) in each iteration using a phi node @@ -295,12 +281,12 @@ def test_loop_with_phi_ghz_histogram(): Creates (|00000⟩ + |11111⟩)/√2. All 5 measurements must agree. Run 10000 shots and verify only "00000" and "11111" appear near 50/50. 
""" - sim = GpuSimulator() - sim.set_program(LOOP_WITH_PHI_QIR) - results = sim.run_shots(10000, seed=42) - + results = run_shots(LOOP_WITH_PHI_QIR) shot_results = results["shot_results"] assert len(shot_results) == 10000 + assert all( + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" counts = Counter(shot_results) # Only "00000" and "11111" should appear @@ -317,19 +303,6 @@ def test_loop_with_phi_ghz_histogram(): assert count_00000 + count_11111 == 10000, "All shots should produce a result" -@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) -def test_loop_with_phi_no_errors(): - """Example 2: All shots should complete without GPU errors.""" - sim = GpuSimulator() - sim.set_program(LOOP_WITH_PHI_QIR) - results = sim.run_shots(1000, seed=123) - - shot_result_codes = results["shot_result_codes"] - assert all( - code == 0 for code in shot_result_codes - ), f"Some shots had non-zero error codes: {[c for c in shot_result_codes if c != 0]}" - - # --------------------------------------------------------------------------- # Tests — Example 4: Boolean computation (AND gate) # --------------------------------------------------------------------------- @@ -342,12 +315,12 @@ def test_boolean_computation_histogram(): r2=1 only when both r0=1 AND r1=1 (~25% of shots). Run 10000 shots and verify ~25% "1" and ~75% "0". 
""" - sim = GpuSimulator() - sim.set_program(BOOLEAN_COMPUTATION_QIR) - results = sim.run_shots(10000, seed=42) - + results = run_shots(BOOLEAN_COMPUTATION_QIR) shot_results = results["shot_results"] assert len(shot_results) == 10000 + assert all( + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" counts = Counter(shot_results) count_0 = counts.get("0", 0) @@ -358,19 +331,6 @@ def test_boolean_computation_histogram(): assert count_0 + count_1 == 10000, "All shots should produce a result" -@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) -def test_boolean_computation_no_errors(): - """Example 4: All shots should complete without GPU errors.""" - sim = GpuSimulator() - sim.set_program(BOOLEAN_COMPUTATION_QIR) - results = sim.run_shots(1000, seed=456) - - shot_result_codes = results["shot_result_codes"] - assert all( - code == 0 for code in shot_result_codes - ), f"Some shots had non-zero error codes: {[c for c in shot_result_codes if c != 0]}" - - # --------------------------------------------------------------------------- # QIR fixture — Example 5: Teleport chain # --------------------------------------------------------------------------- @@ -456,12 +416,12 @@ def test_teleport_chain_histogram(): Final measurements of q0 and q4 (results 2 and 3, labeled "0_t0" and "0_t1") should be correlated: both "0" or both "1", near 50/50. 
""" - sim = GpuSimulator() - sim.set_program(TELEPORT_CHAIN_QIR) - results = sim.run_shots(10000, seed=42) - + results = run_shots(TELEPORT_CHAIN_QIR) shot_results = results["shot_results"] assert len(shot_results) == 10000 + assert all( + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" counts = Counter(shot_results) # Only "00" and "11" should appear (results 4 and 5 are correlated) @@ -478,14 +438,66 @@ def test_teleport_chain_histogram(): assert count_00 + count_11 == 10000, "All shots should produce a result" -@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) -def test_teleport_chain_no_errors(): - """Example 5: All shots should complete without GPU errors.""" - sim = GpuSimulator() - sim.set_program(TELEPORT_CHAIN_QIR) - results = sim.run_shots(1000, seed=789) +DYNAMIC_ROTATION_ANGLE_QIR = r""" +%Result = type opaque +%Qubit = type opaque + +@0 = internal constant [4 x i8] c"0_r\00" + +define i64 @ENTRYPOINT__main() #0 { +block_0: + call void @__quantum__rt__initialize(i8* null) + call void @__quantum__qis__h__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + %var_1 = call i1 @__quantum__rt__read_result(%Result* inttoptr (i64 0 to %Result*)) + %var_2 = icmp eq i1 %var_1, false + br i1 %var_2, label %block_1, label %block_2 +block_1: + br label %block_3 +block_2: + br label %block_3 +block_3: + %var_3 = phi double [0.5, %block_1], [1.0, %block_2] + call void @__quantum__qis__rx__body(double %var_3, %Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* getelementptr inbounds ([4 x i8], [4 x i8]* @0, i64 0, i64 0)) + ret i64 0 +} + +declare void 
@__quantum__rt__initialize(i8*) +declare void @__quantum__qis__h__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) #1 +declare i1 @__quantum__rt__read_result(%Result*) +declare void @__quantum__qis__rx__body(double, %Qubit*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "output_labeling_schema" "qir_profiles"="adaptive_profile" "required_num_qubits"="2" "required_num_results"="2" } +attributes #1 = { "irreversible" } + +!llvm.module.flags = !{!0, !1, !2, !3, !4, !5} + +!0 = !{i32 1, !"qir_major_version", i32 1} +!1 = !{i32 7, !"qir_minor_version", i32 0} +!2 = !{i32 1, !"dynamic_qubit_management", i1 false} +!3 = !{i32 1, !"dynamic_result_management", i1 false} +!4 = !{i32 5, !"int_computations", !{!"i64"}} +!5 = !{i32 5, !"float_computations", !{!"double"}} +""" + - shot_result_codes = results["shot_result_codes"] +@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) +def test_dynamic_rotation_angle(): + results = run_shots(DYNAMIC_ROTATION_ANGLE_QIR) + shot_results = results["shot_results"] + assert len(shot_results) == 10_000 assert all( - code == 0 for code in shot_result_codes - ), f"Some shots had non-zero error codes: {[c for c in shot_result_codes if c != 0]}" + code == 0 for code in results["shot_result_codes"] + ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" + + counts = Counter(shot_results) + count_0 = counts.get("0", 0) + count_1 = counts.get("1", 0) + + assert count_1 > 1400, f"Expected ~15% '1' results, got {count_1}" + assert count_0 > 8400, f"Expected ~85% '0' results, got {count_0}" + assert count_0 + count_1 == 10_000, "All shots should produce a result" diff --git a/source/simulators/src/bytecode.rs b/source/simulators/src/bytecode.rs index afc77f5685..c477b8abc3 100644 --- a/source/simulators/src/bytecode.rs +++ b/source/simulators/src/bytecode.rs @@ -209,12 +209,12 @@ pub struct Op { pub q1: Word, pub q2: 
Word, pub q3: Word, - pub angle: f64, + pub angle: Word, } impl Op { #[must_use] - pub fn from_tuple(t: (Word, Word, Word, Word, f64)) -> Self { + pub fn from_tuple(t: (Word, Word, Word, Word, Word)) -> Self { Self { op_id: t.0, q1: t.1, diff --git a/source/simulators/src/bytecode/runtime.rs b/source/simulators/src/bytecode/runtime.rs index 02a9ff420c..dfbe161a84 100644 --- a/source/simulators/src/bytecode/runtime.rs +++ b/source/simulators/src/bytecode/runtime.rs @@ -235,9 +235,9 @@ fn dispatch_quantum_gate( .collect(); sim.correlated_noise_intrinsic(table_id, &targets); } else { + let angle = rt.resolve_f64(instr.src0, instr.opcode, 0); let q1 = rt.resolve_u64(instr.aux1, instr.opcode, 4) as usize; let q2 = rt.resolve_u64(instr.aux2, instr.opcode, 5) as usize; - let angle = op.angle; match op_id { OPID_X => sim.x(q1), OPID_Y => sim.y(q1), diff --git a/source/simulators/src/gpu_full_state_simulator/common.wgsl b/source/simulators/src/gpu_full_state_simulator/common.wgsl index 769c7ee2ca..907dae91d8 100644 --- a/source/simulators/src/gpu_full_state_simulator/common.wgsl +++ b/source/simulators/src/gpu_full_state_simulator/common.wgsl @@ -34,6 +34,8 @@ const OPID_S = 6u; const OPID_SAdj = 7u; const OPID_T = 8u; const OPID_TAdj = 9u; +const OPID_RX = 12u; +const OPID_RY = 13u; const OPID_RZ = 14u; const OPID_CX = 15u; const OPID_CZ = 16u; diff --git a/source/simulators/src/gpu_full_state_simulator/shader_types.rs b/source/simulators/src/gpu_full_state_simulator/shader_types.rs index 5a45c7c0ed..6c1e66c468 100644 --- a/source/simulators/src/gpu_full_state_simulator/shader_types.rs +++ b/source/simulators/src/gpu_full_state_simulator/shader_types.rs @@ -1113,7 +1113,7 @@ pub fn build_op_pool(compact_ops: &[bytecode::Op]) -> Vec { angle, }| { #[allow(clippy::cast_possible_truncation)] - let angle_f32 = angle as f32; + let angle_f32 = f32::from_bits(angle); match op_id { ops::ID => Op::new_id_gate(q1), ops::RESETZ => Op::new_resetz_gate(q1), diff --git 
a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl index 47b9af7b50..56f455558e 100644 --- a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl +++ b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl @@ -484,6 +484,15 @@ fn resolve_q2(shot_idx: u32) -> u32 { return read_reg(shot_idx, instr.aux2); } +// Resolves the rotation angle for the current quantum instruction. +// The angle is stored in the instruction's src0 field (register or immediate). +fn resolve_gate_angle(shot_idx: u32) -> f32 { + let state = shots[shot_idx].interp; + let instr = fetch_instr(state.pc - 1); + let flags = get_flags(instr.opcode); + return resolve_f32(shot_idx, instr.src0, flags, 0u); +} + fn get_measure_qubit(shot_idx: u32, op_idx: u32) -> u32 { return resolve_q1(shot_idx); } @@ -1343,6 +1352,69 @@ fn prepare_op(@builtin(global_invocation_id) globalId: vec3) { shot.unitary = op.unitary; + // For rotation gates, recompute the unitary from the (possibly dynamic) angle + // stored in the instruction's src0 field. The op pool unitary was built at upload + // time and may not reflect a runtime-computed angle. 
+ if op_type == 0u { + if op.id == OPID_RX || op.id == OPID_RY || op.id == OPID_RZ { + let angle = resolve_gate_angle(shot_idx); + let half = angle * 0.5; + let c = cos(half); + let s = sin(half); + if op.id == OPID_RX { + // [[cos(θ/2), -i·sin(θ/2)], [-i·sin(θ/2), cos(θ/2)]] + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[1] = vec2f(0.0, -s); + shot.unitary[4] = vec2f(0.0, -s); + shot.unitary[5] = vec2f(c, 0.0); + } else if op.id == OPID_RY { + // [[cos(θ/2), -sin(θ/2)], [sin(θ/2), cos(θ/2)]] + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[1] = vec2f(-s, 0.0); + shot.unitary[4] = vec2f(s, 0.0); + shot.unitary[5] = vec2f(c, 0.0); + } else { + // RZ: [[1, 0], [0, e^(iθ)]] + shot.unitary[0] = vec2f(1.0, 0.0); + shot.unitary[1] = vec2f(0.0, 0.0); + shot.unitary[4] = vec2f(0.0, 0.0); + shot.unitary[5] = vec2f(cos(angle), sin(angle)); + } + } else if op.id == OPID_RXX || op.id == OPID_RYY || op.id == OPID_RZZ { + let angle = resolve_gate_angle(shot_idx); + let half = angle * 0.5; + let c = cos(half); + let s = sin(half); + if op.id == OPID_RXX { + // exp(-i·θ/2·X⊗X) + shot.unitary[0] = vec2f(c, 0.0); // 00,00 + shot.unitary[3] = vec2f(0.0, -s); // 00,11 + shot.unitary[5] = vec2f(c, 0.0); // 01,01 + shot.unitary[6] = vec2f(0.0, -s); // 01,10 + shot.unitary[9] = vec2f(0.0, -s); // 10,01 + shot.unitary[10] = vec2f(c, 0.0); // 10,10 + shot.unitary[12] = vec2f(0.0, -s); // 11,00 + shot.unitary[15] = vec2f(c, 0.0); // 11,11 + } else if op.id == OPID_RYY { + // exp(-i·θ/2·Y⊗Y) + shot.unitary[0] = vec2f(c, 0.0); // 00,00 + shot.unitary[3] = vec2f(0.0, s); // 00,11 (+i·sin) + shot.unitary[5] = vec2f(c, 0.0); // 01,01 + shot.unitary[6] = vec2f(0.0, -s); // 01,10 + shot.unitary[9] = vec2f(0.0, -s); // 10,01 + shot.unitary[10] = vec2f(c, 0.0); // 10,10 + shot.unitary[12] = vec2f(0.0, s); // 11,00 (+i·sin) + shot.unitary[15] = vec2f(c, 0.0); // 11,11 + } else { + // RZZ: diag(1, e^(iθ), e^(iθ), 1) + shot.unitary[0] = vec2f(1.0, 0.0); + shot.unitary[5] = vec2f(cos(angle), 
sin(angle)); + shot.unitary[10] = vec2f(cos(angle), sin(angle)); + shot.unitary[15] = vec2f(1.0, 0.0); + } + } + } + switch op_type { case 0u { // Gate shot.op_idx = op_idx; From 590d8b8546eb750e5259ef3c4092d0aa5da53a72 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Tue, 7 Apr 2026 14:20:15 -0700 Subject: [PATCH 11/14] better comments --- .../simulator_adaptive.wgsl | 138 ++++++++++-------- 1 file changed, 75 insertions(+), 63 deletions(-) diff --git a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl index 56f455558e..277134c8a7 100644 --- a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl +++ b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl @@ -507,6 +507,18 @@ fn read_measurement_result(shot_idx: u32, result_id: u32) -> bool { return atomicLoad(&results[shot_idx * RESULT_COUNT + result_id]) == 1u; } +// Return true if the id corresponds to a rotation gate. +fn is_rotation_gate(id: u32) -> bool { + return (12 <= id && id <= 14) || (17 <= id && id <= 19); +} + +// Return true if the angle for the current rotation gate is dynamic. +fn is_dynamic_angle(shot_idx: u32) -> bool { + let state = shots[shot_idx].interp; + let instr = fetch_instr(state.pc - 1); + return (instr.opcode | FLAG_SRC0_IMM) != 0; +} + // For every qubit, each 'execute' kernel thread will update its own workgroup storage location for accumulating probabilities // The final probabilities will be reduced and written back to the shot state after the parallel execution completes. struct QubitProbabilityPerThread { @@ -1352,71 +1364,71 @@ fn prepare_op(@builtin(global_invocation_id) globalId: vec3) { shot.unitary = op.unitary; - // For rotation gates, recompute the unitary from the (possibly dynamic) angle - // stored in the instruction's src0 field. The op pool unitary was built at upload - // time and may not reflect a runtime-computed angle. 
- if op_type == 0u { - if op.id == OPID_RX || op.id == OPID_RY || op.id == OPID_RZ { - let angle = resolve_gate_angle(shot_idx); - let half = angle * 0.5; - let c = cos(half); - let s = sin(half); - if op.id == OPID_RX { - // [[cos(θ/2), -i·sin(θ/2)], [-i·sin(θ/2), cos(θ/2)]] - shot.unitary[0] = vec2f(c, 0.0); - shot.unitary[1] = vec2f(0.0, -s); - shot.unitary[4] = vec2f(0.0, -s); - shot.unitary[5] = vec2f(c, 0.0); - } else if op.id == OPID_RY { - // [[cos(θ/2), -sin(θ/2)], [sin(θ/2), cos(θ/2)]] - shot.unitary[0] = vec2f(c, 0.0); - shot.unitary[1] = vec2f(-s, 0.0); - shot.unitary[4] = vec2f(s, 0.0); - shot.unitary[5] = vec2f(c, 0.0); - } else { - // RZ: [[1, 0], [0, e^(iθ)]] - shot.unitary[0] = vec2f(1.0, 0.0); - shot.unitary[1] = vec2f(0.0, 0.0); - shot.unitary[4] = vec2f(0.0, 0.0); - shot.unitary[5] = vec2f(cos(angle), sin(angle)); - } - } else if op.id == OPID_RXX || op.id == OPID_RYY || op.id == OPID_RZZ { - let angle = resolve_gate_angle(shot_idx); - let half = angle * 0.5; - let c = cos(half); - let s = sin(half); - if op.id == OPID_RXX { - // exp(-i·θ/2·X⊗X) - shot.unitary[0] = vec2f(c, 0.0); // 00,00 - shot.unitary[3] = vec2f(0.0, -s); // 00,11 - shot.unitary[5] = vec2f(c, 0.0); // 01,01 - shot.unitary[6] = vec2f(0.0, -s); // 01,10 - shot.unitary[9] = vec2f(0.0, -s); // 10,01 - shot.unitary[10] = vec2f(c, 0.0); // 10,10 - shot.unitary[12] = vec2f(0.0, -s); // 11,00 - shot.unitary[15] = vec2f(c, 0.0); // 11,11 - } else if op.id == OPID_RYY { - // exp(-i·θ/2·Y⊗Y) - shot.unitary[0] = vec2f(c, 0.0); // 00,00 - shot.unitary[3] = vec2f(0.0, s); // 00,11 (+i·sin) - shot.unitary[5] = vec2f(c, 0.0); // 01,01 - shot.unitary[6] = vec2f(0.0, -s); // 01,10 - shot.unitary[9] = vec2f(0.0, -s); // 10,01 - shot.unitary[10] = vec2f(c, 0.0); // 10,10 - shot.unitary[12] = vec2f(0.0, s); // 11,00 (+i·sin) - shot.unitary[15] = vec2f(c, 0.0); // 11,11 - } else { - // RZZ: diag(1, e^(iθ), e^(iθ), 1) - shot.unitary[0] = vec2f(1.0, 0.0); - shot.unitary[5] = vec2f(cos(angle), 
sin(angle)); - shot.unitary[10] = vec2f(cos(angle), sin(angle)); - shot.unitary[15] = vec2f(1.0, 0.0); - } - } - } - switch op_type { case 0u { // Gate + // For rotation gates, recompute the unitary from the dynamic angle stored + // in the instruction's src0 field if needed. The op pool unitary was built + // at upload time and may not reflect a runtime-computed angle. + if is_rotation_gate(op.id) && is_dynamic_angle(shot_idx) { + if op.id == OPID_RX || op.id == OPID_RY || op.id == OPID_RZ { + let angle = resolve_gate_angle(shot_idx); + let half = angle * 0.5; + let c = cos(half); + let s = sin(half); + if op.id == OPID_RX { + // [[cos(θ/2), -i·sin(θ/2)], [-i·sin(θ/2), cos(θ/2)]] + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[1] = vec2f(0.0, -s); + shot.unitary[4] = vec2f(0.0, -s); + shot.unitary[5] = vec2f(c, 0.0); + } else if op.id == OPID_RY { + // [[cos(θ/2), -sin(θ/2)], [sin(θ/2), cos(θ/2)]] + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[1] = vec2f(-s, 0.0); + shot.unitary[4] = vec2f(s, 0.0); + shot.unitary[5] = vec2f(c, 0.0); + } else { + // RZ: [[1, 0], [0, e^(iθ)]] + shot.unitary[0] = vec2f(1.0, 0.0); + shot.unitary[1] = vec2f(0.0, 0.0); + shot.unitary[4] = vec2f(0.0, 0.0); + shot.unitary[5] = vec2f(cos(angle), sin(angle)); + } + } else if op.id == OPID_RXX || op.id == OPID_RYY || op.id == OPID_RZZ { + let angle = resolve_gate_angle(shot_idx); + let half = angle * 0.5; + let c = cos(half); + let s = sin(half); + if op.id == OPID_RXX { + // exp(-i·θ/2·X⊗X) + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[3] = vec2f(0.0, -s); + shot.unitary[5] = vec2f(c, 0.0); + shot.unitary[6] = vec2f(0.0, -s); + shot.unitary[9] = vec2f(0.0, -s); + shot.unitary[10] = vec2f(c, 0.0); + shot.unitary[12] = vec2f(0.0, -s); + shot.unitary[15] = vec2f(c, 0.0); + } else if op.id == OPID_RYY { + // exp(-i·θ/2·Y⊗Y) + shot.unitary[0] = vec2f(c, 0.0); + shot.unitary[3] = vec2f(0.0, s); + shot.unitary[5] = vec2f(c, 0.0); + shot.unitary[6] = vec2f(0.0, -s); + shot.unitary[9] = 
vec2f(0.0, -s); + shot.unitary[10] = vec2f(c, 0.0); + shot.unitary[12] = vec2f(0.0, s); + shot.unitary[15] = vec2f(c, 0.0); + } else { + // RZZ: diag(1, e^(iθ), e^(iθ), 1) + shot.unitary[0] = vec2f(1.0, 0.0); + shot.unitary[5] = vec2f(cos(angle), sin(angle)); + shot.unitary[10] = vec2f(cos(angle), sin(angle)); + shot.unitary[15] = vec2f(1.0, 0.0); + } + } + } + shot.op_idx = op_idx; shot.op_type = op.id; From 9ef68fc37be4fd048d36415c8dc21db2244ba700 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Mon, 27 Apr 2026 11:25:28 -0700 Subject: [PATCH 12/14] address PR feedback --- source/pip/qsharp/_adaptive_bytecode.py | 1 + source/pip/qsharp/_adaptive_pass.py | 15 +++- source/pip/qsharp/_device/_atom/__init__.py | 13 +++- .../pip/tests/test_adaptive_cpu_bytecode.py | 56 ++++++++++++++- .../pip/tests/test_adaptive_gpu_bytecode.py | 69 ++++++++++++++++++- source/simulators/src/bytecode/runtime.rs | 17 +++++ .../simulator_adaptive.wgsl | 13 ++++ 7 files changed, 179 insertions(+), 5 deletions(-) diff --git a/source/pip/qsharp/_adaptive_bytecode.py b/source/pip/qsharp/_adaptive_bytecode.py index b86b8e2cda..aa244fc59c 100644 --- a/source/pip/qsharp/_adaptive_bytecode.py +++ b/source/pip/qsharp/_adaptive_bytecode.py @@ -44,6 +44,7 @@ OP_RESET = 0x12 OP_READ_RESULT = 0x13 OP_RECORD_OUTPUT = 0x14 +OP_READ_LOSS = 0x15 # ── Integer Arithmetic ─────────────────────────────────────────────────────── OP_ADD = 0x20 diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index 4860f28c17..e8ad692dc3 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -167,6 +167,13 @@ class QuantumOp: q1: int q2: int q3: int + # ``angle`` is stored as the raw bit pattern of an IEEE-754 float + # (encoded via ``encode_float_as_bits``) so it can be packed into the + # same integer-typed FFI table as the qubit indices. The Rust side + # reinterprets these bits as f32/f64 depending on the bytecode width. 
+ # + # This also follows the same pattern in which floats are encoded as ints + # in the ``Instruction`` class. angle: int @@ -703,9 +710,15 @@ def _emit_call(self, call: pyqir.Call) -> None: | "__quantum__rt__begin_parallel" | "__quantum__rt__end_parallel" | "__quantum__qis__barrier__body" - | "__quantum__rt__read_loss" ): pass # No-op + case "__quantum__rt__read_loss": + # Allocate a bool register and emit OP_READ_LOSS so the runtime + # can ask the simulator whether the given result was produced + # by measuring a lost qubit. Programs may branch on this value. + dst = self._alloc_reg(call, REG_TYPE_BOOL) + result_reg = self._resolve_result_operand(call.args[0]) + self._emit(OP_READ_LOSS, dst=dst, src0=result_reg) case _ if callee in self._func_to_id: self._emit_ir_function_call(call) case _ if "qdk_noise" in call.callee.attributes.func: diff --git a/source/pip/qsharp/_device/_atom/__init__.py b/source/pip/qsharp/_device/_atom/__init__.py index 74bb1a80b2..f58a7ab77f 100644 --- a/source/pip/qsharp/_device/_atom/__init__.py +++ b/source/pip/qsharp/_device/_atom/__init__.py @@ -258,9 +258,18 @@ def simulate( if noise is None: noise = NoiseConfig() - # Override s, s_adj, and z noise if they are unset - # and rz noise is set. + # Override t, t_adj, s, s_adj, and z noise if they are unset and rz noise is set. 
if noise and not noise.rz.is_noiseless(): + if noise.t.is_noiseless(): + noise.t.x = noise.rz.x + noise.t.y = noise.rz.y + noise.t.z = noise.rz.z + noise.t.loss = noise.rz.loss + if noise.t_adj.is_noiseless(): + noise.t_adj.x = noise.rz.x + noise.t_adj.y = noise.rz.y + noise.t_adj.z = noise.rz.z + noise.t_adj.loss = noise.rz.loss if noise.s.is_noiseless(): noise.s.x = noise.rz.x noise.s.y = noise.rz.y diff --git a/source/pip/tests/test_adaptive_cpu_bytecode.py b/source/pip/tests/test_adaptive_cpu_bytecode.py index 9118b27291..89f8229bf5 100644 --- a/source/pip/tests/test_adaptive_cpu_bytecode.py +++ b/source/pip/tests/test_adaptive_cpu_bytecode.py @@ -15,7 +15,7 @@ from collections import Counter import pytest -from qsharp._simulation import run_qir, Result +from qsharp._simulation import run_qir, NoiseConfig, Result import qsharp.openqasm from typing import Literal @@ -470,6 +470,60 @@ def test_record_output_ordering(sim_type): ) +# ========================================================================= +# OP_READ_LOSS — read whether a measurement observed qubit loss +# ========================================================================= + +READ_LOSS_QIR = """ +entry: + ; Apply s to qubit 0 purely for its noise side effect. With + ; ``noise.s.loss = 1.0`` the simulator faults qubit 0 as lost on every + ; shot, so the next mz on qubit 0 records ``MeasurementResult::Loss`` + ; into result 0. Qubit 1 is left untouched (no noise on x), so the + ; conditional X below cleanly flips it to |1⟩. + call void @__quantum__qis__s__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + ; Read the loss bit for result 0 — should be 1 because the qubit was lost. + %lost = call i1 @__quantum__rt__read_loss(%Result* inttoptr (i64 0 to %Result*)) + br i1 %lost, label %then, label %end + +then: + ; Witness: if read_loss reported true, flip qubit 1 to |1⟩. 
+ call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + br label %end + +end: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + +READ_LOSS_DECLS = """ +declare i1 @__quantum__rt__read_loss(%Result*) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_read_loss(sim_type): + """rz (with 100% loss) → mz → read_loss → branch on loss → mz witness. + + Record both results: result 0 should always be ``Loss`` ('L'), and + result 1 should always be ``One`` ('1') because ``read_loss`` saw the + loss and the conditional X was applied to qubit 1. + """ + qir = format_qir( + READ_LOSS_QIR, + extra_decls=READ_LOSS_DECLS, + num_qubits=2, + num_results=2, + ) + noise = NoiseConfig() + noise.s.loss = 1.0 + results = run_qir(qir, SHOTS, noise, seed=42, type=sim_type) + counts = Counter(map_result_list_to_str(r) for r in results) + assert counts == { + "L1": SHOTS + }, f"Expected all {SHOTS} shots to be 'L1', got {counts}" + + # ######################################################################### # Integer Arithmetic # ######################################################################### diff --git a/source/pip/tests/test_adaptive_gpu_bytecode.py b/source/pip/tests/test_adaptive_gpu_bytecode.py index e7e1335e66..29c9184bac 100644 --- a/source/pip/tests/test_adaptive_gpu_bytecode.py +++ b/source/pip/tests/test_adaptive_gpu_bytecode.py @@ -35,7 +35,7 @@ except OSError as e: SKIP_REASON = str(e) -from qsharp._simulation import GpuSimulator +from qsharp._simulation import GpuSimulator, NoiseConfig, Result, run_qir # --------------------------------------------------------------------------- # Helpers @@ -57,6 +57,19 @@ def _run(qir: str, shots: int = SHOTS, seed: int = 42): return sim.run_shots(shots, seed=seed) +def map_result_list_to_str(results): + s = "" + for r in results: + match r: + case Result.Zero: + s += "0" + case Result.One: + s += "1" + case Result.Loss: 
+ s += "L" + return s + + def check_result( qir_fragment: str, expected: str, @@ -476,6 +489,60 @@ def test_record_output_ordering(): check_result(RECORD_OUTPUT_QIR, "10", num_qubits=2, num_results=2) +# ========================================================================= +# OP_READ_LOSS — read whether a measurement observed qubit loss +# ========================================================================= + +READ_LOSS_QIR = """ +entry: + ; Apply s to qubit 0 purely for its noise side effect. With + ; ``noise.s.loss = 1.0`` the simulator faults qubit 0 as lost on every + ; shot, so the next mz on qubit 0 records ``MeasurementResult::Loss`` + ; into result 0. Qubit 1 is left untouched (no noise on x), so the + ; conditional X below cleanly flips it to |1⟩. + call void @__quantum__qis__s__body(%Qubit* inttoptr (i64 0 to %Qubit*)) + call void @__quantum__qis__mz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + ; Read the loss bit for result 0 — should be 1 because the qubit was lost. + %lost = call i1 @__quantum__rt__read_loss(%Result* inttoptr (i64 0 to %Result*)) + br i1 %lost, label %then, label %end + +then: + ; Witness: if read_loss reported true, flip qubit 1 to |1⟩. + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + br label %end + +end: + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) +""" + +READ_LOSS_DECLS = """\ +declare i1 @__quantum__rt__read_loss(%Result*) +""" + + +@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) +def test_read_loss(): + """s (with 100% loss) → mz → read_loss → branch on loss → mz witness. + + Record both results: result 0 should always be ``Loss`` ('L'), and + result 1 should always be ``One`` ('1') because ``read_loss`` saw the + loss and the conditional X was applied to qubit 1. 
+ """ + qir = format_qir( + READ_LOSS_QIR, + extra_decls=READ_LOSS_DECLS, + num_qubits=2, + num_results=2, + ) + noise = NoiseConfig() + noise.s.loss = 1.0 + results = run_qir(qir, SHOTS, noise, seed=42, type="gpu") + counts = Counter(map_result_list_to_str(r) for r in results) + assert counts == { + "L1": SHOTS + }, f"Expected all {SHOTS} shots to be 'L1', got {counts}" + + # ######################################################################### # Integer Arithmetic # ######################################################################### diff --git a/source/simulators/src/bytecode/runtime.rs b/source/simulators/src/bytecode/runtime.rs index dfbe161a84..ebe136b725 100644 --- a/source/simulators/src/bytecode/runtime.rs +++ b/source/simulators/src/bytecode/runtime.rs @@ -48,6 +48,7 @@ const OP_MEASURE: u8 = 0x11; const OP_RESET: u8 = 0x12; const OP_READ_RESULT: u8 = 0x13; const OP_RECORD_OUTPUT: u8 = 0x14; +const OP_READ_LOSS: u8 = 0x15; // Integer arithmetic const OP_ADD: u8 = 0x20; @@ -424,6 +425,22 @@ pub fn run_shot(program: &AdaptiveProgram, sim: &mut S) { rt.pc += 1; } + OP_READ_LOSS => { + // Reports whether the measurement that produced this result + // observed a lost qubit. The simulator records ``Loss`` in + // its measurement buffer when the qubit was lost prior to + // the measurement; here we simply project that to a 1/0 bool + // for the program to branch on. + let result_id = rt.resolve_u64(instr.src0, flags, 0) as usize; + let measurements = sim.measurements(); + let val = u64::from( + result_id < measurements.len() + && matches!(measurements[result_id], MeasurementResult::Loss), + ); + rt.write_reg(instr.dst, val); + rt.pc += 1; + } + OP_RECORD_OUTPUT => { // No-op on CPU — results are read from the simulator directly. 
rt.pc += 1; diff --git a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl index 277134c8a7..8dea1dc259 100644 --- a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl +++ b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl @@ -322,6 +322,7 @@ const OP_MEASURE: u32 = 0x11; const OP_RESET: u32 = 0x12; const OP_READ_RESULT: u32 = 0x13; const OP_RECORD_OUTPUT: u32 = 0x14; +const OP_READ_LOSS: u32 = 0x15; // -- Integer Arithmetic ------------------------------------------------------- const OP_ADD: u32 = 0x20; @@ -946,6 +947,18 @@ fn interpret_classical(@builtin(global_invocation_id) gid: vec3) { pc++; } + // READ_LOSS: Reports whether the measurement that produced a + // result observed a lost qubit. The per-shot ``results`` buffer + // encodes loss as the value 2u (0u = Zero, 1u = One, 2u = Loss), + // so we compare against 2u and write 1u when the result was a loss, + // else 0u. + case OP_READ_LOSS { + let result_id = instr.src0; + let val = atomicLoad(&results[shot_idx * RESULT_COUNT + result_id]); + write_reg(shot_idx, instr.dst, select(0u, 1u, val == 2u)); + pc++; + } + // ------------------------------------------------------------- // INTEGER ARITHMETIC // ------------------------------------------------------------- From 2bf618d3d2f6d33f5274559456c301c68880effb Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Mon, 27 Apr 2026 18:43:01 -0700 Subject: [PATCH 13/14] Fix multiple bugs uncovered by recently enable integration tests Adds quantum_move instruction to bytecode, fix output recording for adaptive profile, and fix a few bugs in the adaptive GPU shader. 
--- source/pip/qsharp/_adaptive_pass.py | 25 ++++ source/pip/qsharp/_simulation.py | 58 +++++----- .../pip/tests/test_adaptive_cpu_bytecode.py | 108 +++++++++++++++++- source/pip/tests/test_adaptive_cpu_noise.py | 2 +- .../tests/test_adaptive_cpu_quantum_ops.py | 19 +-- .../pip/tests/test_adaptive_gpu_bytecode.py | 105 +++++++++++++++-- source/pip/tests/test_adaptive_gpu_noise.py | 4 +- .../tests/test_adaptive_gpu_quantum_ops.py | 32 ++++-- .../gpu_full_state_simulator/gpu_context.rs | 3 +- .../simulator_adaptive.wgsl | 19 ++- 10 files changed, 307 insertions(+), 68 deletions(-) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index e8ad692dc3..b184e9e28d 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -55,6 +55,7 @@ class Bytecode(Enum): "mz": 21, "mresetz": 22, "swap": 24, + "move": 28, } # Gates that take a result ID as a second argument @@ -66,6 +67,12 @@ class Bytecode(Enum): # Rotation gates that take an angle parameter as first argument ROTATION_GATES = {"rx", "ry", "rz", "rxx", "ryy", "rzz"} +# Single-qubit gates whose QIR signature carries device-specific extra +# arguments after the qubit pointer (e.g. ``move(qubit, i64, i64)``). The +# extra args are scheduling metadata for hardware backends and are not +# qubit IDs, so we resolve only ``args[0]`` and ignore the rest. +MOVE_GATES = {"move"} + # --------------------------------------------------------------------------- # ICmp / FCmp predicate mappings # --------------------------------------------------------------------------- @@ -783,6 +790,24 @@ def _emit_quantum_call(self, call: pyqir.Call) -> None: aux1=q, ) return + if gate_name in MOVE_GATES: + # ``move(qubit, i64, i64)``: only the first arg is a qubit; the + # remaining args are device-specific scheduling metadata that + # the simulator ignores. 
Emit a single-qubit OP_QUANTUM_GATE so + # the runtime invokes ``Simulator::mov`` (which applies the + # configured ``noise.mov`` faults to that qubit). + q1, q2, q3 = self._resolve_qubit_operands([call.args[0]]) + angle = FloatOperand(0.0, self._bytecode_kind) + qop_idx = self._emit_quantum_op(op_id, q1.val, q2.val, q3.val, angle.val) + self._emit( + OP_QUANTUM_GATE, + src0=angle, + aux0=qop_idx, + aux1=q1, + aux2=q2, + aux3=q3, + ) + return if gate_name in ROTATION_GATES: qubit_arg_offset = 1 angle = self._resolve_angle_operand(call.args[0]) diff --git a/source/pip/qsharp/_simulation.py b/source/pip/qsharp/_simulation.py index d3ee5a5b8c..b20a2ef54b 100644 --- a/source/pip/qsharp/_simulation.py +++ b/source/pip/qsharp/_simulation.py @@ -530,6 +530,7 @@ def run_base( def run_adaptive( rust_run_adaptive_fn: Callable, + mod: pyqir.Module, program: AdaptiveProgram, shots: int, noise: Optional[NoiseConfig], @@ -539,18 +540,9 @@ def run_adaptive( Runs an adaptive profile program given a rust simulator. Adds output recording logic. """ results = rust_run_adaptive_fn(program.as_dict(), shots, noise, seed) - # Extract recorded output result indices from the bytecode. - # OP_RECORD_OUTPUT with aux1=0 is result_record_output where - # src0 is the result index in the results buffer. 
- recorded_result_indices = [] - for ins in program.instructions: - if (ins.opcode & 0xFF) == OP_RECORD_OUTPUT and ins.aux1 == 0: - recorded_result_indices.append(ins.src0) - # Filter shot_results to only include recorded output indices - filtered = [] - for s in results: - filtered.append([str_to_result(s[i]) for i in recorded_result_indices]) - return filtered + recorder = OutputRecordingPass() + recorder.run(mod) + return list(map(recorder.process_output, results)) def run_qir_clifford( @@ -562,7 +554,7 @@ def run_qir_clifford( (mod, shots, noise, seed) = preprocess_simulation_input(input, shots, noise, seed) if is_adaptive(mod): program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) - return run_adaptive(run_clifford_adaptive, program, shots, noise, seed) + return run_adaptive(run_clifford_adaptive, mod, program, shots, noise, seed) else: return run_base(run_clifford, mod, shots, noise, seed) @@ -577,7 +569,7 @@ def run_qir_cpu( DecomposeCcxPass().run(mod) if is_adaptive(mod): program = AdaptiveProfilePass(Bytecode.Bit64).run(mod, noise) - return run_adaptive(run_cpu_adaptive, program, shots, noise, seed) + return run_adaptive(run_cpu_adaptive, mod, program, shots, noise, seed) else: return run_base(run_cpu_full_state, mod, shots, noise, seed) @@ -593,7 +585,9 @@ def run_qir_gpu( DecomposeCcxPass().run(mod) if is_adaptive(mod): program = AdaptiveProfilePass(Bytecode.Bit32).run(mod, noise) - return run_adaptive(run_adaptive_parallel_shots, program, shots, noise, seed) + return run_adaptive( + run_adaptive_parallel_shots, mod, program, shots, noise, seed + ) else: return run_base(run_parallel_shots, mod, shots, noise, seed) @@ -625,7 +619,7 @@ class GpuSimulator: def __init__(self): self.gpu_context = GpuContext() self._is_adaptive = False - self._recorded_result_indices = [] + self._recorder = None self.tables = None def load_noise_tables( @@ -667,15 +661,11 @@ def set_program(self, input: Union[QirInputData, str, bytes]): mod, 
noise_intrinsics=noise_intrinsics ) self.gpu_context.set_adaptive_program(program.as_dict()) - - # Extract recorded output result indices from the bytecode. - # OP_RECORD_OUTPUT with aux1=0 is result_record_output where - # src0 is the result index in the results buffer. - self._recorded_result_indices = [] - for instr in program.instructions: - if instr.opcode & 0xFF == OP_RECORD_OUTPUT and instr.aux1 == 0: - self._recorded_result_indices.append(instr.src0) + # This is used later for output recording + self._recorder = OutputRecordingPass() + self._recorder.run(mod) else: + self._is_adaptive = False (self.gates, self.required_num_qubits, self.required_num_results) = ( prepare_qir_with_correlated_noise( input, self.tables if not self.tables is None else [] @@ -693,13 +683,19 @@ def run_shots(self, shots: int, seed: Optional[int] = None) -> "GpuShotResults": seed = seed if seed is not None else random.randint(0, 2**32 - 1) if self._is_adaptive: results = self.gpu_context.run_adaptive_shots(shots, seed=seed) - # Filter shot_results to only include recorded output indices - if self._recorded_result_indices: - indices = self._recorded_result_indices - filtered = [] - for s in results["shot_results"]: - filtered.append("".join(s[i] for i in indices)) - results["shot_results"] = filtered + for i, (shot_ret_code, shot_result) in enumerate( + zip(results["shot_result_codes"], results["shot_results"]) + ): + if shot_ret_code == 0: + # If the ret_code was zero, we do an output recording pass + # on the output. + results["shot_results"][i] = self._recorder.process_output( + shot_result + ) + else: + # If the shot finished with a ret_code other than zero, + # we set the result to `None`. 
+ results["shot_results"][i] = None return results return self.gpu_context.run_shots(shots, seed=seed) diff --git a/source/pip/tests/test_adaptive_cpu_bytecode.py b/source/pip/tests/test_adaptive_cpu_bytecode.py index 89f8229bf5..632bc69564 100644 --- a/source/pip/tests/test_adaptive_cpu_bytecode.py +++ b/source/pip/tests/test_adaptive_cpu_bytecode.py @@ -32,8 +32,11 @@ def map_result_list_to_str(results): results_str = "" - for r in results: - match r: + if isinstance(results, (list, tuple)): + for r in results: + results_str += map_result_list_to_str(r) + else: + match results: case Result.Zero: results_str += "0" case Result.One: @@ -186,15 +189,12 @@ def test_nop_smoke(sim_type): RET_QIR = """ entry: - ret i64 0 - call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) - call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) """ @pytest.mark.parametrize("sim_type", SIM_TYPES) def test_ret(sim_type): - check_result(RET_QIR, "0", sim_type=sim_type) + check_result(RET_QIR, "", sim_type=sim_type, num_qubits=0, num_results=0) # ========================================================================= @@ -524,6 +524,46 @@ def test_read_loss(sim_type): }, f"Expected all {SHOTS} shots to be 'L1', got {counts}" +# ========================================================================= +# move (OpID 28) — qubit move with associated noise +# ========================================================================= + +MOVE_QIR = """ +entry: + ; ``move`` is a no-op on the simulator state, but the simulator applies + ; the configured ``noise.mov`` faults to the moved qubit. With + ; ``noise.mov.x = 1.0`` every move flips the qubit, so q0 ends in |1⟩. 
+ call void @__quantum__qis__move__body(%Qubit* inttoptr (i64 0 to %Qubit*), i64 0, i64 0) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +MOVE_DECLS = """\ +declare void @__quantum__qis__move__body(%Qubit*, i64, i64) +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_move_applies_noise(sim_type): + """move (with 100% X noise) → mz → always 1.""" + qir = format_qir(MOVE_QIR, extra_decls=MOVE_DECLS, num_qubits=1, num_results=1) + noise = NoiseConfig() + noise.mov.x = 1.0 + results = run_qir(qir, SHOTS, noise, seed=42, type=sim_type) + counts = Counter(map_result_list_to_str(r) for r in results) + assert counts == {"1": SHOTS}, f"Expected all {SHOTS} shots to be '1', got {counts}" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_move_noiseless_is_noop(sim_type): + """move without noise is a pure no-op → q0 stays in |0⟩ → measure 0.""" + check_result( + MOVE_QIR, + "0", + extra_decls=MOVE_DECLS, + sim_type=sim_type, + ) + + # ######################################################################### # Integer Arithmetic # ######################################################################### @@ -1313,6 +1353,62 @@ def test_shift_bitwise_chain(sim_type): check_arith_result(SHIFT_BITWISE_CHAIN_QIR, "1", sim_type=sim_type) +# ######################################################################### +# Structured Output Recording +# ######################################################################### + + +NESTED_OUTPUT_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define i64 @ENTRYPOINT__main() #0 { + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 3 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), 
%Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 3 to %Qubit*), %Result* inttoptr (i64 3 to %Result*)) + call void @__quantum__rt__tuple_record_output(i64 2, i8* null) + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* null) + ret i64 0 +} + +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__array_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="4" "required_num_results"="4" } +""" + + +@pytest.mark.parametrize("sim_type", SIM_TYPES) +def test_nested_output_structure(sim_type): + """Verify that adaptive results preserve nested tuple/array structure. + + The QIR records output as a tuple of two arrays: ([r0, r1], [r2, r3]). + Before the fix, run_adaptive flattened this into [r0, r1, r2, r3]. 
+ """ + results = run_qir(NESTED_OUTPUT_QIR, shots=10, seed=42, type=sim_type) + for shot in results: + assert isinstance(shot, tuple), f"Expected tuple, got {type(shot)}: {shot}" + assert len(shot) == 2, f"Expected 2-element tuple, got {len(shot)}: {shot}" + assert isinstance( + shot[0], list + ), f"Expected list, got {type(shot[0])}: {shot[0]}" + assert isinstance( + shot[1], list + ), f"Expected list, got {type(shot[1])}: {shot[1]}" + assert shot == ([Result.Zero, Result.One], [Result.Zero, Result.One]) + + # ========================================================================= # OP_SWITCH with computed value from arithmetic # ========================================================================= diff --git a/source/pip/tests/test_adaptive_cpu_noise.py b/source/pip/tests/test_adaptive_cpu_noise.py index df829528e2..303c56e2ab 100644 --- a/source/pip/tests/test_adaptive_cpu_noise.py +++ b/source/pip/tests/test_adaptive_cpu_noise.py @@ -348,7 +348,7 @@ def test_noise_intrinsic_1q_x_flip(sim_type): table = noise.intrinsic("noise_1q", 1) table.x = 1.0 output = run_qir(QIR_NOISE_1Q, shots=1, noise=noise, type=sim_type) - assert output == [[Result.One]] + assert output == [Result.One] QASM_NOISE_2Q = """ diff --git a/source/pip/tests/test_adaptive_cpu_quantum_ops.py b/source/pip/tests/test_adaptive_cpu_quantum_ops.py index dcc4f7323e..0a119f7a92 100644 --- a/source/pip/tests/test_adaptive_cpu_quantum_ops.py +++ b/source/pip/tests/test_adaptive_cpu_quantum_ops.py @@ -26,16 +26,19 @@ def map_result_list_to_str(results): - results_str = "" - for r in results: - match r: + s = "" + if isinstance(results, (list, tuple)): + for r in results: + s += map_result_list_to_str(r) + else: + match results: case Result.Zero: - results_str += "0" + s += "0" case Result.One: - results_str += "1" + s += "1" case Result.Loss: - results_str += "L" - return results_str + s += "L" + return s def _run( @@ -238,6 +241,7 @@ def _run( call void @__quantum__qis__reset__body(%Qubit* 
inttoptr (i64 4 to %Qubit*)) br label %exit exit: + call void @__quantum__rt__tuple_record_output(i64 2, i8* null) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @0, i32 0, i32 0)) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @1, i32 0, i32 0)) ret void @@ -252,6 +256,7 @@ def _run( declare void @__quantum__rt__initialize(i8*) declare i1 @__quantum__qis__read_result__body(%Result*) declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="5" "required_num_results"="4" } attributes #1 = { "irreversible" } diff --git a/source/pip/tests/test_adaptive_gpu_bytecode.py b/source/pip/tests/test_adaptive_gpu_bytecode.py index 29c9184bac..195aec5262 100644 --- a/source/pip/tests/test_adaptive_gpu_bytecode.py +++ b/source/pip/tests/test_adaptive_gpu_bytecode.py @@ -59,8 +59,11 @@ def _run(qir: str, shots: int = SHOTS, seed: int = 42): def map_result_list_to_str(results): s = "" - for r in results: - match r: + if isinstance(results, (list, tuple)): + for r in results: + s += map_result_list_to_str(r) + else: + match results: case Result.Zero: s += "0" case Result.One: @@ -88,7 +91,7 @@ def check_result( record=record, ) results = _run(qir, SHOTS)["shot_results"] - counts = Counter(results) + counts = Counter(map_result_list_to_str(r) for r in results) assert counts == { expected: SHOTS }, f"Expected all {SHOTS} shots to be '{expected}', got {counts}" @@ -202,8 +205,6 @@ def test_nop_smoke(): # Every test already exercises RET implicitly. This tests an explicit early ret. 
RET_QIR = """ entry: - ret i64 0 - call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 0 to %Qubit*)) call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) """ @@ -543,6 +544,41 @@ def test_read_loss(): }, f"Expected all {SHOTS} shots to be 'L1', got {counts}" +# ========================================================================= +# move (OpID 28) — qubit move with associated noise +# ========================================================================= + +MOVE_QIR = """ +entry: + ; ``move`` is a no-op on the simulator state, but the simulator applies + ; the configured ``noise.mov`` faults to the moved qubit. With + ; ``noise.mov.x = 1.0`` every move flips the qubit, so q0 ends in |1⟩. + call void @__quantum__qis__move__body(%Qubit* inttoptr (i64 0 to %Qubit*), i64 0, i64 0) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) +""" + +MOVE_DECLS = """\ +declare void @__quantum__qis__move__body(%Qubit*, i64, i64) +""" + + +@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) +def test_move_applies_noise(): + """move (with 100% X noise) → mz → always 1.""" + qir = format_qir(MOVE_QIR, extra_decls=MOVE_DECLS, num_qubits=1, num_results=1) + noise = NoiseConfig() + noise.mov.x = 1.0 + results = run_qir(qir, SHOTS, noise, seed=42, type="gpu") + counts = Counter(map_result_list_to_str(r) for r in results) + assert counts == {"1": SHOTS}, f"Expected all {SHOTS} shots to be '1', got {counts}" + + +@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) +def test_move_noiseless_is_noop(): + """move without noise is a pure no-op → q0 stays in |0⟩ → measure 0.""" + check_result(MOVE_QIR, "0", extra_decls=MOVE_DECLS) + + # ######################################################################### # Integer Arithmetic # ######################################################################### @@ -1276,7 +1312,7 @@ def 
test_dynamic_qubit_loop(): """3-qubit GHZ via dynamic qubit loop — only '000' and '111' should appear.""" qir = format_qir(DYNAMIC_QUBIT_LOOP_QIR, num_qubits=3, num_results=3) results = _run(qir, shots=5000, seed=42)["shot_results"] - counts = Counter(results) + counts = Counter(map_result_list_to_str(r) for r in results) assert set(counts.keys()) <= {"000", "111"}, f"Unexpected GHZ outcomes: {counts}" assert counts.get("000", 0) > 1500 assert counts.get("111", 0) > 1500 @@ -1362,6 +1398,61 @@ def test_switch_from_arithmetic(): check_result(SWITCH_ARITH_QIR, "1") +# ######################################################################### +# Structured Output Recording +# ######################################################################### + +NESTED_OUTPUT_QIR = """\ +%Result = type opaque +%Qubit = type opaque + +define i64 @ENTRYPOINT__main() #0 { + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 1 to %Qubit*)) + call void @__quantum__qis__x__body(%Qubit* inttoptr (i64 3 to %Qubit*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 0 to %Qubit*), %Result* inttoptr (i64 0 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 1 to %Qubit*), %Result* inttoptr (i64 1 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 2 to %Qubit*), %Result* inttoptr (i64 2 to %Result*)) + call void @__quantum__qis__mresetz__body(%Qubit* inttoptr (i64 3 to %Qubit*), %Result* inttoptr (i64 3 to %Result*)) + call void @__quantum__rt__tuple_record_output(i64 2, i8* null) + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 0 to %Result*), i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 1 to %Result*), i8* null) + call void @__quantum__rt__array_record_output(i64 2, i8* null) + call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* null) + call void 
@__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* null) + ret i64 0 +} + +declare void @__quantum__qis__x__body(%Qubit*) +declare void @__quantum__qis__mresetz__body(%Qubit*, %Result*) +declare void @__quantum__rt__tuple_record_output(i64, i8*) +declare void @__quantum__rt__array_record_output(i64, i8*) +declare void @__quantum__rt__result_record_output(%Result*, i8*) + +attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="4" "required_num_results"="4" } +""" + + +@pytest.mark.skipif(not GPU_AVAILABLE, reason=SKIP_REASON) +def test_nested_output_structure(): + """Verify that adaptive results preserve nested tuple/array structure. + + The QIR records output as a tuple of two arrays: ([r0, r1], [r2, r3]). + Before the fix, run_adaptive flattened this into [r0, r1, r2, r3]. + """ + results = _run(NESTED_OUTPUT_QIR, shots=10) + for shot in results["shot_results"]: + assert isinstance(shot, tuple), f"Expected tuple, got {type(shot)}: {shot}" + assert len(shot) == 2, f"Expected 2-element tuple, got {len(shot)}: {shot}" + assert isinstance( + shot[0], list + ), f"Expected list, got {type(shot[0])}: {shot[0]}" + assert isinstance( + shot[1], list + ), f"Expected list, got {type(shot[1])}: {shot[1]}" + assert shot == ([Result.Zero, Result.One], [Result.Zero, Result.One]) + + # ========================================================================= # Float: sitofp → fadd → fptosi round-trip # ========================================================================= @@ -1722,7 +1813,7 @@ def test_complex_rus_exceeds_128_registers(): bit[4] result = measure q; """ results = _run_openqasm(qasm_src, shots=100) - shot_results = results["shot_results"] + shot_results = [map_result_list_to_str(r) for r in results["shot_results"]] # Results include the mid-circuit measurement bit plus 4 final qubits assert all( len(r) >= 4 and all(c in "01" for c in r) for r in shot_results diff --git 
a/source/pip/tests/test_adaptive_gpu_noise.py b/source/pip/tests/test_adaptive_gpu_noise.py index 819f773b30..d3795db810 100644 --- a/source/pip/tests/test_adaptive_gpu_noise.py +++ b/source/pip/tests/test_adaptive_gpu_noise.py @@ -283,7 +283,7 @@ def test_noise_intrinsics_gpu_sim_class(): sim.load_noise_tables("./csv_dir_test") sim.set_program(QIR_WITH_CORRELATED_NOISE) output = sim.run_shots(shots=1)["shot_results"] - assert output == ["101"] + assert output == [[Result.One, Result.Zero, Result.One]] NOISE_INTRINSICS_WITH_REGISTERS_QIR = r""" @@ -373,7 +373,7 @@ def test_noise_intrinsic_1q_x_flip(): table = noise.intrinsic("noise_1q", 1) table.x = 1.0 output = run_qir(QIR_NOISE_1Q, shots=1, noise=noise, type="gpu") - assert output == [[Result.One]] + assert output == [Result.One] QASM_NOISE_2Q = """ diff --git a/source/pip/tests/test_adaptive_gpu_quantum_ops.py b/source/pip/tests/test_adaptive_gpu_quantum_ops.py index 5befd7a4c8..01613fdc3a 100644 --- a/source/pip/tests/test_adaptive_gpu_quantum_ops.py +++ b/source/pip/tests/test_adaptive_gpu_quantum_ops.py @@ -34,7 +34,23 @@ except OSError as e: SKIP_REASON = str(e) -from qsharp._simulation import GpuSimulator +from qsharp._simulation import GpuSimulator, Result + + +def map_result_list_to_str(results): + s = "" + if isinstance(results, (list, tuple)): + for r in results: + s += map_result_list_to_str(r) + else: + match results: + case Result.Zero: + s += "0" + case Result.One: + s += "1" + case Result.Loss: + s += "L" + return s # Acquiring the GPU resources takes time, so we acquire them once and use them @@ -143,7 +159,7 @@ def test_measure_and_correct_histogram(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) # Each shot produces a single-bit result string: "0" or "1" count_0 = counts.get("0", 0) count_1 = 
counts.get("1", 0) @@ -169,7 +185,7 @@ def test_conditional_loop_all_results_are_one(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) # Every shot should exit with result "1" assert ( counts.get("1", 0) == shots @@ -288,7 +304,7 @@ def test_loop_with_phi_ghz_histogram(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) # Only "00000" and "11111" should appear assert set(counts.keys()) <= { "00000", @@ -322,7 +338,7 @@ def test_boolean_computation_histogram(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) count_0 = counts.get("0", 0) count_1 = counts.get("1", 0) @@ -382,6 +398,7 @@ def test_boolean_computation_histogram(): call void @__quantum__qis__reset__body(%Qubit* inttoptr (i64 4 to %Qubit*)) br label %exit exit: + call void @__quantum__rt__tuple_record_output(i64 2, i8* null) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 2 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @0, i32 0, i32 0)) call void @__quantum__rt__result_record_output(%Result* inttoptr (i64 3 to %Result*), i8* getelementptr inbounds ([5 x i8], [5 x i8]* @1, i32 0, i32 0)) ret void @@ -396,6 +413,7 @@ def test_boolean_computation_histogram(): declare void @__quantum__rt__initialize(i8*) declare i1 @__quantum__qis__read_result__body(%Result*) declare void @__quantum__rt__result_record_output(%Result*, i8*) +declare void @__quantum__rt__tuple_record_output(i64, 
i8*) attributes #0 = { "entry_point" "qir_profiles"="adaptive_profile" "required_num_qubits"="5" "required_num_results"="4" } attributes #1 = { "irreversible" } @@ -423,7 +441,7 @@ def test_teleport_chain_histogram(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) # Only "00" and "11" should appear (results 4 and 5 are correlated) assert set(counts.keys()) <= { "00", @@ -494,7 +512,7 @@ def test_dynamic_rotation_angle(): code == 0 for code in results["shot_result_codes"] ), f"Some shots had non-zero error codes: {[c for c in results['shot_result_codes'] if c != 0]}" - counts = Counter(shot_results) + counts = Counter(map_result_list_to_str(r) for r in shot_results) count_0 = counts.get("0", 0) count_1 = counts.get("1", 0) diff --git a/source/simulators/src/gpu_full_state_simulator/gpu_context.rs b/source/simulators/src/gpu_full_state_simulator/gpu_context.rs index 125841fc31..6ac3d8f310 100644 --- a/source/simulators/src/gpu_full_state_simulator/gpu_context.rs +++ b/source/simulators/src/gpu_full_state_simulator/gpu_context.rs @@ -770,7 +770,8 @@ impl GpuContext { self.resources.reset_diagnostics_header()?; // Initialize state vectors and shot data via the init kernel. - // The init kernel zeros and configures the base ShotData fields per shot. + // The init kernel also zeros the results buffer per shot to prevent + // stale exit codes from prior runs leaking via atomicCompareExchangeWeak. 
{ let kernels = self.resources.get_kernels()?; let mut encoder = self.resources.get_encoder("Adaptive Init Encoder")?; diff --git a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl index 8dea1dc259..f9c620b6fa 100644 --- a/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl +++ b/source/simulators/src/gpu_full_state_simulator/simulator_adaptive.wgsl @@ -572,6 +572,13 @@ fn initialize( // Set the |0...0> amplitude to 1.0 from the first workgroup & thread for the shot stateVector[params.shot_state_vector_start] = vec2f(1.0, 0.0); reset_all(params.shot_idx); + + // Zero the results buffer for this shot so stale exit codes from + // prior runs do not leak via atomicCompareExchangeWeak in OP_RET. + let results_base = u32(params.shot_idx) * RESULT_COUNT; + for (var r = 0u; r < RESULT_COUNT; r++) { + atomicStore(&results[results_base + r], 0u); + } } } @@ -805,7 +812,7 @@ fn interpret_classical(@builtin(global_invocation_id) gid: vec3) { let arg_offset = instr.aux2; let func = batch_data.program.function_table[func_id]; // Push return info onto the call stack - let sp = state.call_sp; + let sp = shots[shot_idx].interp.call_sp; // Guard: prevent call stack overflow (max 8 frames) if sp >= 8u { shots[shot_idx].interp.exit_code = ERR_CALL_STACK_OVERFLOW; @@ -839,7 +846,7 @@ fn interpret_classical(@builtin(global_invocation_id) gid: vec3) { // return register (not 0xFFFFFFFF), copies the return value into // that register. 
case OP_CALL_RETURN { - if state.call_sp == 0u { + if shots[shot_idx].interp.call_sp == 0u { shots[shot_idx].interp.exit_code = ERR_CALL_STACK_UNDERFLOW; let err_idx = (shot_idx + 1) * RESULT_COUNT - 1; atomicCompareExchangeWeak(&results[err_idx], 0u, ERR_CALL_STACK_UNDERFLOW); @@ -849,11 +856,11 @@ fn interpret_classical(@builtin(global_invocation_id) gid: vec3) { break; } - let sp = state.call_sp - 1; + let sp = shots[shot_idx].interp.call_sp - 1; shots[shot_idx].interp.call_sp = sp; - block_id = state.call_stack_frames[sp].block_id; // go back to the callers block - pc = state.call_stack_frames[sp].return_pc; // restore pc - let return_reg = state.call_stack_frames[sp].return_reg; + block_id = shots[shot_idx].interp.call_stack_frames[sp].block_id; + pc = shots[shot_idx].interp.call_stack_frames[sp].return_pc; + let return_reg = shots[shot_idx].interp.call_stack_frames[sp].return_reg; if return_reg != VOID_RETURN { write_reg(shot_idx, return_reg, read_reg(shot_idx, instr.src0)); } From db84962c9c7cacd7c33638eb73c6555a078e85d1 Mon Sep 17 00:00:00 2001 From: Oscar Puente Date: Mon, 27 Apr 2026 18:54:18 -0700 Subject: [PATCH 14/14] add int overflow check --- source/pip/qsharp/_adaptive_pass.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/source/pip/qsharp/_adaptive_pass.py b/source/pip/qsharp/_adaptive_pass.py index b184e9e28d..f5c731cd76 100644 --- a/source/pip/qsharp/_adaptive_pass.py +++ b/source/pip/qsharp/_adaptive_pass.py @@ -221,6 +221,9 @@ def __post_init__(self): # their two's-complement representation # (e.g. -7 → 0xFFFFFFF9 for 32-bit, 0xFFFFFFFFFFFFFFF9 for 64-bit). mask = (1 << self.bits) - 1 + min_val = -(1 << (self.bits - 1)) + if self.val < min_val or self.val > mask: + raise ValueError(f"Value {self.val} does not fit in {self.bits} bits") self.val = self.val & mask