From 3e2e893ff8da73c918229bbcfdb924d11c9b41b3 Mon Sep 17 00:00:00 2001
From: zanderjiang
Date: Mon, 10 Nov 2025 13:03:11 -0500
Subject: [PATCH 01/10] feat: add FFI prompt tests with environment-based API
 keys

---
 .gitignore                                    |   2 +
 flashinfer_bench/ffi/ffi_prompt.py            | 647 ++++++++++++++++++
 flashinfer_bench/ffi/load_inline.py           |  45 ++
 .../ffi/test_existing_prompt_sol.py           | 234 +++++++
 flashinfer_bench/ffi/test_prompt.py           | 283 ++++++++
 .../test_results/claude-opus-4-1-20250805.txt | 124 ++++
 .../ffi/test_results/gpt-5-2025-08-07.txt     | 111 +++
 .../test_results/gpt-5-mini-2025-08-07.txt    | 129 ++++
 flashinfer_bench/ffi/test_results/o3.txt      | 141 ++++
 .../ffi/test_results/o4-mini-2025-04-16.txt   | 122 ++++
 10 files changed, 1838 insertions(+)
 create mode 100644 flashinfer_bench/ffi/ffi_prompt.py
 create mode 100644 flashinfer_bench/ffi/load_inline.py
 create mode 100644 flashinfer_bench/ffi/test_existing_prompt_sol.py
 create mode 100644 flashinfer_bench/ffi/test_prompt.py
 create mode 100644 flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt
 create mode 100644 flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt
 create mode 100644 flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt
 create mode 100644 flashinfer_bench/ffi/test_results/o3.txt
 create mode 100644 flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt

diff --git a/.gitignore b/.gitignore
index 1055077..35ca3cb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,3 +47,5 @@ web/**/public
 tmp/
 AGENTS.md
+
+.env
\ No newline at end of file
diff --git a/flashinfer_bench/ffi/ffi_prompt.py b/flashinfer_bench/ffi/ffi_prompt.py
new file mode 100644
index 0000000..0a9db3e
--- /dev/null
+++ b/flashinfer_bench/ffi/ffi_prompt.py
@@ -0,0 +1,647 @@
+FFI_PROMPT_SIMPLE = """
+Use TVM FFI format for your generated kernel host function and bindings
+
+Use the following headers:
+#include <tvm/ffi/container/tensor.h>  // TensorView: tensor arguments
+#include <tvm/ffi/function.h>          // TVM_FFI_DLL_EXPORT_TYPED_FUNC
+#include <tvm/ffi/error.h>             // TVM_FFI_ICHECK, TVM_FFI_THROW
+
+Include when using CUDA / env-managed streams or allocators:
+#include <tvm/ffi/extra/c_env_api.h>   // TVMFFIEnvGetStream, TVMFFIEnvTensorAlloc
+
+# TVM FFI API Reference
+
+## TensorView (tvm/ffi/container/tensor.h)
+
+**Purpose:** Non-owning view of tensor data. Use `tvm::ffi::TensorView` for function parameters.
+
+### Essential Methods
+```cpp
+void* data_ptr() const           // Raw data pointer
+DLDevice device() const          // Device info (.device_type, .device_id)
+DLDataType dtype() const         // Data type (.code, .bits, .lanes)
+int32_t ndim() const             // Number of dimensions
+int64_t size(int64_t idx) const  // Size of dimension idx (negative idx: count from end)
+int64_t numel() const            // Total number of elements
+bool IsContiguous() const        // Check if memory is contiguous
+```
+
+### ShapeView Access
+```cpp
+ShapeView shape() const    // Get shape (array-like)
+ShapeView strides() const  // Get strides (array-like)
+
+// ShapeView can be indexed like an array:
+// tensor.shape()[0], tensor.shape()[1], etc.
+```
+
+### Device Type Constants (DLDevice.device_type)
+```cpp
+kDLCPU = 1       // CPU
+kDLCUDA = 2      // CUDA GPU
+kDLCUDAHost = 3  // CUDA pinned memory
+kDLROCM = 10     // ROCm/HIP
+```
+
+### Data Type Constants (DLDataType)
+```cpp
+// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes
+// Common .code values:
+kDLInt = 0     // Signed integer
+kDLUInt = 1    // Unsigned integer
+kDLFloat = 2   // IEEE floating point
+kDLBfloat = 4  // Brain floating point
+
+// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1
+// Example: half/fp16 has code=2, bits=16, lanes=1
+```
+
+## Function Export (tvm/ffi/function.h)
+
+### Export Macro
+```cpp
+// Use this to export your C++ function for FFI
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function)
+
+// Example:
+void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... }
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel);
+```
+
+**Supported function signatures:**
+- `void func(TensorView t1, TensorView t2, ...)`
+- `void func(TensorView t, int64_t size, float alpha, ...)`
+- `int64_t func(TensorView t)`
+- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string`
+
+## Error Handling (tvm/ffi/error.h)
+
+### Throwing Errors
+```cpp
+// Throw with custom error kind
+TVM_FFI_THROW(ValueError) << "Invalid input: " << x;
+TVM_FFI_THROW(RuntimeError) << "CUDA error: " << cudaGetErrorString(err);
+TVM_FFI_THROW(TypeError) << "Expected float32, got int32";
+```
+
+### Assertions (for internal logic errors)
+```cpp
+TVM_FFI_ICHECK(condition) << "message"                  // General check
+TVM_FFI_ICHECK_EQ(x, y) << "x must equal y"             // x == y
+TVM_FFI_ICHECK_NE(x, y) << "x must not equal y"         // x != y
+TVM_FFI_ICHECK_LT(x, y) << "x must be less than y"      // x < y
+TVM_FFI_ICHECK_LE(x, y) << "x must be at most y"        // x <= y
+TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y"   // x > y
+TVM_FFI_ICHECK_GE(x, y) << "x must be at least y"       // x >= y
+```
+
+### User Input Validation (use TVM_FFI_THROW instead)
+```cpp
+// For user-facing errors, use TVM_FFI_THROW with appropriate error kind:
+if (x.ndim() != 2) {
+  TVM_FFI_THROW(ValueError) << "Expected 2D tensor, got " << x.ndim() << "D";
+}
+if (x.dtype().code != kDLFloat || x.dtype().bits != 32) {
+  TVM_FFI_THROW(TypeError) << "Expected float32 dtype";
+}
+```
+
+## CUDA Stream Management (tvm/ffi/extra/c_env_api.h)
+
+```cpp
+// Get the current CUDA stream for a device
+TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id)
+
+// Usage:
+DLDevice dev = tensor.device();
+cudaStream_t stream = static_cast<cudaStream_t>(
+    TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+
+// Launch kernel on the stream:
+my_kernel<<<blocks, threads, 0, stream>>>(...);
+```
+
+# Common Patterns
+
+## Dtype Validation
+```cpp
+void MyKernel(tvm::ffi::TensorView x) {
+  DLDataType dt = x.dtype();
+  TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype";
+  TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 32-bit dtype";
+  // Or use TVM_FFI_THROW for user-facing validation
+}
+```
+
+## Shape Validation
+```cpp
+void MatMul(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) {
+  TVM_FFI_ICHECK_EQ(a.ndim(), 2) << "a must be 2D";
+  TVM_FFI_ICHECK_EQ(b.ndim(), 2) << "b must be 2D";
+  TVM_FFI_ICHECK_EQ(a.size(1), b.size(0)) << "Shape mismatch for matmul";
+  TVM_FFI_ICHECK_EQ(c.size(0), a.size(0)) << "Output shape mismatch";
+  TVM_FFI_ICHECK_EQ(c.size(1), b.size(1)) << "Output shape mismatch";
+}
+```
+
+## Multi-dimensional Indexing
+```cpp
+void Process2D(tvm::ffi::TensorView x) {
+  int64_t height = x.size(0);
+  int64_t width = x.size(1);
+  float* data = static_cast<float*>(x.data_ptr());
+
+  // If contiguous (row-major), access as: data[i * width + j]
+  TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor";
+}
+```
+
+## Device-specific Kernels
+```cpp
+void MyKernel(tvm::ffi::TensorView x) {
+  DLDevice dev = x.device();
+  if (dev.device_type == kDLCUDA) {
+    // Launch CUDA kernel
+  } else if (dev.device_type == kDLCPU) {
+    // CPU implementation
+  } else {
+    TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type;
+  }
+}
+```
+
+Example Usage of FFI binding:
+```cpp
+// File: add_one_cuda.cu
+#include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+namespace my_kernels {
+
+__global__ void AddOneKernel(float* x, float* y, int n) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx] + 1.0f;
+  }
+}
+
+void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
+  // Input validation
+  TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D";
+  TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D";
+  TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch";
+
+  // Get data pointers
+  float* x_data = static_cast<float*>(x.data_ptr());
+  float* y_data = static_cast<float*>(y.data_ptr());
+  int64_t n = x.size(0);
+
+  // Get CUDA stream from environment
+  DLDevice dev = x.device();
+  cudaStream_t stream = static_cast<cudaStream_t>(
+      TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+
+  // Launch kernel
+  int64_t threads = 256;
+  int64_t blocks = (n + threads - 1) / threads;
+  AddOneKernel<<<blocks, threads, 0, stream>>>(x_data, y_data, n);
+}
+
+// Export the function with name "add_one_cuda"
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne);
+
+}  // namespace my_kernels
+```
+"""
+
+FFI_PROMPT = """
+You should use TVM FFI bindings for your code.
+
+# Required Headers
+
+```cpp
+#include <tvm/ffi/container/tensor.h>  // Tensor, TensorView
+#include <tvm/ffi/function.h>          // Function export macros
+#include <tvm/ffi/error.h>             // Error handling
+#include <tvm/ffi/extra/c_env_api.h>   // Environment APIs (streams, allocators)
+```
+
+# Complete Example: CUDA Kernel Binding
+
+```cpp
+// File: add_one_cuda.cu
+#include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/c_env_api.h>
+
+namespace my_kernels {
+
+__global__ void AddOneKernel(float* x, float* y, int n) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx] + 1.0f;
+  }
+}
+
+void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
+  // Input validation
+  TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D";
+  TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D";
+  TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch";
+
+  // Get data pointers
+  float* x_data = static_cast<float*>(x.data_ptr());
+  float* y_data = static_cast<float*>(y.data_ptr());
+  int64_t n = x.size(0);
+
+  // Get CUDA stream from environment
+  DLDevice dev = x.device();
+  cudaStream_t stream = static_cast<cudaStream_t>(
+      TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+
+  // Launch kernel
+  int64_t threads = 256;
+  int64_t blocks = (n + threads - 1) / threads;
+  AddOneKernel<<<blocks, threads, 0, stream>>>(x_data, y_data, n);
+}
+
+// Export the function with name "add_one_cuda"
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne);
+
+}  // namespace my_kernels
+```
+
+# TVM FFI API Documentation
+
+## 1. Tensor Container API (tvm/ffi/container/tensor.h)
+
+### Tensor Class
+A managed n-dimensional array with reference counting.
+ +**Methods:** +- `void* data_ptr() const` - Returns raw data pointer +- `DLDevice device() const` - Returns device info (`.device_type`, `.device_id`) +- `int32_t ndim() const` - Returns number of dimensions +- `DLDataType dtype() const` - Returns data type (`.code`, `.bits`, `.lanes`) +- `ShapeView shape() const` - Returns shape array (indexable: `shape()[0]`, `shape()[1]`, ...) +- `ShapeView strides() const` - Returns strides array +- `int64_t size(int64_t idx) const` - Returns size of dimension at idx (negative idx counts from end) +- `int64_t stride(int64_t idx) const` - Returns stride of dimension at idx +- `int64_t numel() const` - Returns total number of elements +- `uint64_t byte_offset() const` - Returns byte offset +- `bool IsContiguous() const` - Checks if tensor memory is contiguous +- `bool IsAligned(size_t alignment) const` - Checks if data is aligned to given bytes + +**Static Factory Methods:** +- `static Tensor FromDLPack(DLManagedTensor* tensor, size_t require_alignment = 0, bool require_contiguous = false)` +- `static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, size_t require_alignment = 0, bool require_contiguous = false)` +- `template static Tensor FromNDAlloc(TNDAlloc alloc, ffi::ShapeView shape, DLDataType dtype, DLDevice device, ExtraArgs&&... extra_args)` +- `static Tensor FromEnvAlloc(int (*env_alloc)(DLTensor*, TVMFFIObjectHandle*), ffi::ShapeView shape, DLDataType dtype, DLDevice device)` + +**Conversion Methods:** +- `DLManagedTensor* ToDLPack() const` - Convert to DLPack managed tensor +- `DLManagedTensorVersioned* ToDLPackVersioned() const` - Convert to versioned DLPack +- `const DLTensor* GetDLTensorPtr() const` - Get underlying DLTensor pointer + +### TensorView Class +Non-owning lightweight view of a Tensor. Kernel entrypoints should use `tvm::ffi::TensorView` (or `const TensorView&`) for tensor inputs/outputs. 
+ +**Constructors:** +- `TensorView(const Tensor& tensor)` - From Tensor +- `TensorView(const DLTensor* tensor)` - From DLTensor pointer + +**Methods (same interface as Tensor):** +- `void* data_ptr() const` +- `DLDevice device() const` +- `int32_t ndim() const` +- `DLDataType dtype() const` +- `ShapeView shape() const` +- `ShapeView strides() const` +- `int64_t size(int64_t idx) const` +- `int64_t stride(int64_t idx) const` +- `int64_t numel() const` +- `uint64_t byte_offset() const` +- `bool IsContiguous() const` + +### Utility Functions +- `bool IsContiguous(const DLTensor& arr)` - Check if DLTensor is contiguous +- `bool IsAligned(const DLTensor& arr, size_t alignment)` - Check alignment +- `bool IsDirectAddressDevice(const DLDevice& device)` - Check if device uses direct addressing +- `size_t GetDataSize(size_t numel, DLDataType dtype)` - Calculate bytes for packed data +- `size_t GetDataSize(const DLTensor& arr)` - Calculate bytes in DLTensor +- `size_t GetDataSize(const Tensor& tensor)` - Calculate bytes in Tensor +- `size_t GetDataSize(const TensorView& tensor)` - Calculate bytes in TensorView + +### Device Type Constants (DLDevice.device_type) +```cpp +kDLCPU = 1 // CPU +kDLCUDA = 2 // CUDA GPU +kDLCUDAHost = 3 // CUDA pinned memory +kDLROCM = 10 // ROCm/HIP +``` + +### Data Type Constants (DLDataType) +```cpp +// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes +kDLInt = 0 // Signed integer +kDLUInt = 1 // Unsigned integer +kDLFloat = 2 // IEEE floating point +kDLBfloat = 4 // Brain floating point + +// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1 +// Example: half/fp16 has code=2, bits=16, lanes=1 +``` + +## 2. Function API (tvm/ffi/function.h) + +### Function Class +Type-erased callable object. + +**Constructors:** +- `Function(std::nullptr_t)` - Null constructor +- `template Function(TCallable packed_call)` - From callable (legacy) + +**Static Factory Methods:** +- `template static Function FromPacked(TCallable packed_call)` - From packed signature: `void(const AnyView*, int32_t, Any*)` or `void(PackedArgs, Any*)` +- `template static Function FromTyped(TCallable callable)` - From typed C++ function +- `template static Function FromTyped(TCallable callable, std::string name)` - With name for error messages +- `static Function FromExternC(void* self, TVMFFISafeCallType safe_call, void (*deleter)(void* self))` - From C callback + +**Global Registry:** +- `static std::optional GetGlobal(std::string_view name)` +- `static std::optional GetGlobal(const std::string& name)` +- `static std::optional GetGlobal(const String& name)` +- `static std::optional GetGlobal(const char* name)` +- `static Function GetGlobalRequired(std::string_view name)` - Throws if not found +- `static Function GetGlobalRequired(const std::string& name)` +- `static Function GetGlobalRequired(const String& name)` +- `static Function GetGlobalRequired(const char* name)` +- `static void SetGlobal(std::string_view name, Function func, bool override = false)` +- `static std::vector ListGlobalNames()` +- `static void RemoveGlobal(const String& name)` + +**Invocation:** +- `template Any operator()(Args&&... args) const` - Call with unpacked args +- `void CallPacked(const AnyView* args, int32_t num_args, Any* result) const` +- `void CallPacked(PackedArgs args, Any* result) const` +- `template static Any InvokeExternC(void* handle, TVMFFISafeCallType safe_call, Args&&... args)` + +### TypedFunction Class +Type-safe wrapper around Function. 
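+
+For example, a typed function can wrap a C++ lambda and be called like a normal
+function. This is a minimal sketch; the `TypedFunction<R(Args...)>` template form
+is assumed from the constructor and `operator()` signatures listed below:
+
+```cpp
+// Wrap a lambda in a type-safe FFI function object (sketch).
+tvm::ffi::TypedFunction<int(int)> add_one([](int x) { return x + 1; });
+int result = add_one(41);  // calls the lambda through the typed wrapper
+```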
+ +**Constructors:** +- `TypedFunction()` - Default +- `TypedFunction(std::nullptr_t)` +- `TypedFunction(Function packed)` - From Function +- `template TypedFunction(FLambda typed_lambda)` - From lambda +- `template TypedFunction(FLambda typed_lambda, std::string name)` - With name + +**Methods:** +- `R operator()(Args... args) const` - Type-safe invocation +- `operator Function() const` - Convert to Function +- `const Function& packed() const&` - Get internal Function +- `Function&& packed() &&` - Move internal Function +- `static std::string TypeSchema()` - Get JSON type schema + +### PackedArgs Class +Represents packed arguments. + +**Constructor:** +- `PackedArgs(const AnyView* data, int32_t size)` + +**Methods:** +- `int size() const` - Number of arguments +- `const AnyView* data() const` - Raw argument array +- `PackedArgs Slice(int begin, int end = -1) const` - Get subset +- `AnyView operator[](int i) const` - Access argument +- `template static void Fill(AnyView* data, Args&&... args)` - Pack arguments + +### Export Macro +```cpp +// Export typed C++ function for FFI +TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) + +// Example: +void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... } +TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); +``` + +**Supported function signatures:** +- `void func(TensorView t1, TensorView t2, ...)` +- `void func(TensorView t, int64_t size, float alpha, ...)` +- `int64_t func(TensorView t)` +- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string` + +## 3. Error Handling API (tvm/ffi/error.h) + +### Error Class +Exception object with stack trace. + +**Constructor:** +- `Error(std::string kind, std::string message, std::string backtrace)` +- `Error(std::string kind, std::string message, const TVMFFIByteArray* backtrace)` + +**Methods:** +- `std::string kind() const` - Error category +- `std::string message() const` - Error description +- `std::string backtrace() const` - Stack trace +- `std::string TracebackMostRecentCallLast() const` - Python-style traceback +- `const char* what() const noexcept override` - Standard exception interface +- `void UpdateBacktrace(const TVMFFIByteArray* backtrace_str, int32_t update_mode)` - Modify traceback + +### EnvErrorAlreadySet Exception +Thrown when error exists in frontend environment (e.g., Python interrupt). 
+ +**Usage:** +```cpp +void LongRunningFunction() { + if (TVMFFIEnvCheckSignals() != 0) { + throw ::tvm::ffi::EnvErrorAlreadySet(); + } + // do work here +} +``` + +### Error Macros +```cpp +// Throw with backtrace +TVM_FFI_THROW(ErrorKind) << message + +// Log to stderr and throw (for startup functions) +TVM_FFI_LOG_AND_THROW(ErrorKind) << message + +// Check C function return code +TVM_FFI_CHECK_SAFE_CALL(func) + +// Wrap C++ code for C API +TVM_FFI_SAFE_CALL_BEGIN(); +// c++ code region here +TVM_FFI_SAFE_CALL_END(); +``` + +### Assertion Macros +```cpp +TVM_FFI_ICHECK(condition) << "message" // General check +TVM_FFI_CHECK(condition, ErrorKind) << "message" // Custom error type +TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y +TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y +TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y +TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y +TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y +TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y +TVM_FFI_ICHECK_NOTNULL(ptr) << "ptr must not be null" // ptr != nullptr +``` + +**Common Error Kinds:** +- `ValueError` - Invalid argument values +- `TypeError` - Type mismatch +- `RuntimeError` - Runtime failures (CUDA errors, etc.) +- `InternalError` - Internal logic errors + +### Utility Functions +- `int32_t TypeKeyToIndex(std::string_view type_key)` - Get type index from key + +## 4. Environment API (tvm/ffi/extra/c_env_api.h) + +### Stream Management +```cpp +// Set current stream for a device +int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, + TVMFFIStreamHandle stream, + TVMFFIStreamHandle* opt_out_original_stream) + +// Get current stream for a device +TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) + +// Usage example: +DLDevice dev = tensor.device(); +cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); +``` + +### Tensor Allocation +```cpp +// Set DLPack allocator (TLS or global) +int TVMFFIEnvSetDLPackManagedTensorAllocator( + DLPackManagedTensorAllocator allocator, + int write_to_global_context, + DLPackManagedTensorAllocator* opt_out_original_allocator) + +// Get current allocator +DLPackManagedTensorAllocator TVMFFIEnvGetDLPackManagedTensorAllocator() + +// Allocate tensor using environment allocator +int TVMFFIEnvTensorAlloc(DLTensor* prototype, TVMFFIObjectHandle* out) + +// Usage with FromEnvAlloc: +ffi::Tensor tensor = ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, shape, dtype, device); +``` + +### Module & Symbol Management +```cpp +// Lookup function from module imports +int TVMFFIEnvModLookupFromImports(TVMFFIObjectHandle library_ctx, + const char* func_name, + TVMFFIObjectHandle* out) + +// Register context symbol (available when library is loaded) +int TVMFFIEnvModRegisterContextSymbol(const char* name, void* symbol) + +// Register system library symbol +int TVMFFIEnvModRegisterSystemLibSymbol(const char* name, void* symbol) + +// Register C API symbol +int TVMFFIEnvRegisterCAPI(const char* name, void* symbol) +``` + +### Utilities +```cpp +// Check for environment signals (e.g., Python Ctrl+C) +int TVMFFIEnvCheckSignals() +``` + +### Type Definitions +```cpp +typedef void* TVMFFIStreamHandle // Stream handle type +``` + +# Common Patterns + +## Dtype Validation +```cpp +void MyKernel(tvm::ffi::TensorView x) { + DLDataType dt = x.dtype(); + TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype"; + TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 
32-bit dtype";
+
+  // Or for user-facing errors:
+  if (dt.code != kDLFloat || dt.bits != 32) {
+    TVM_FFI_THROW(TypeError) << "Expected float32, got "
+                             << "code=" << dt.code << " bits=" << dt.bits;
+  }
+}
+```
+
+## Shape Validation
+```cpp
+void MatMul(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) {
+  TVM_FFI_ICHECK_EQ(a.ndim(), 2) << "a must be 2D";
+  TVM_FFI_ICHECK_EQ(b.ndim(), 2) << "b must be 2D";
+  TVM_FFI_ICHECK_EQ(a.size(1), b.size(0)) << "Shape mismatch for matmul";
+  TVM_FFI_ICHECK_EQ(c.size(0), a.size(0)) << "Output shape mismatch";
+  TVM_FFI_ICHECK_EQ(c.size(1), b.size(1)) << "Output shape mismatch";
+}
+```
+
+## Multi-dimensional Indexing
+```cpp
+void Process2D(tvm::ffi::TensorView x) {
+  int64_t height = x.size(0);
+  int64_t width = x.size(1);
+  float* data = static_cast<float*>(x.data_ptr());
+
+  // If contiguous (row-major), access as: data[i * width + j]
+  TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor";
+
+  // With strides:
+  int64_t stride0 = x.stride(0);
+  int64_t stride1 = x.stride(1);
+  // Access: data[i * stride0 + j * stride1]
+}
+```
+
+## Device-specific Kernels
+```cpp
+void MyKernel(tvm::ffi::TensorView x) {
+  DLDevice dev = x.device();
+  if (dev.device_type == kDLCUDA) {
+    // Launch CUDA kernel
+    cudaStream_t stream = static_cast<cudaStream_t>(
+        TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+    // kernel<<<blocks, threads, 0, stream>>>(...);
+  } else if (dev.device_type == kDLCPU) {
+    // CPU implementation
+  } else {
+    TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type;
+  }
+}
+```
+
+## Allocating New Tensors
+```cpp
+void CreateOutput(tvm::ffi::TensorView input) {
+  // Create output tensor with same device and dtype as input
+  ffi::ShapeView shape = input.shape();
+  DLDataType dtype = input.dtype();
+  DLDevice device = input.device();
+
+  // Use environment allocator
+  ffi::Tensor output = ffi::Tensor::FromEnvAlloc(
+      TVMFFIEnvTensorAlloc, shape, dtype, device);
+}
+```
+"""
\ No newline at end of file
diff --git a/flashinfer_bench/ffi/load_inline.py b/flashinfer_bench/ffi/load_inline.py
new file mode 100644
index 0000000..8a70492
--- /dev/null
+++ b/flashinfer_bench/ffi/load_inline.py
@@ -0,0 +1,45 @@
+import torch
+from tvm_ffi import Module
+import tvm_ffi.cpp
+from pathlib import Path
+import re
+
+cuda_source = """
+// File: compile/add_one_cuda.cu
+#include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/extra/c_env_api.h>
+#include <tvm/ffi/function.h>
+
+namespace tvm_ffi_example_cuda {
+
+__global__ void AddOneKernel(float* x, float* y, int n) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    y[idx] = x[idx] + 1;
+  }
+}
+
+void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) {
+  int64_t n = x.size(0);
+  float* x_data = static_cast<float*>(x.data_ptr());
+  float* y_data = static_cast<float*>(y.data_ptr());
+  int64_t threads = 256;
+  int64_t blocks = (n + threads - 1) / threads;
+  cudaStream_t stream =
+      static_cast<cudaStream_t>(TVMFFIEnvGetStream(x.device().device_type, x.device().device_id));
+  AddOneKernel<<<blocks, threads, 0, stream>>>(x_data, y_data, n);
+}
+
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, tvm_ffi_example_cuda::AddOne);
+}  // namespace tvm_ffi_example_cuda
+"""
+
+def main():
+    mod: Module = tvm_ffi.cpp.load_inline(
+        name='add_one_cuda',
+        cuda_sources=cuda_source,
+    )
+    print("Compilation successful")
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/flashinfer_bench/ffi/test_existing_prompt_sol.py b/flashinfer_bench/ffi/test_existing_prompt_sol.py
new file mode 100644
index 0000000..ef13b94
--- /dev/null
+++ b/flashinfer_bench/ffi/test_existing_prompt_sol.py
@@ -0,0 +1,234 @@
+""" +Test script to re-test previously generated CUDA kernels from test_results directory. +Reads .txt files containing generated code and runs the same test cases. +""" + +import torch +from tvm_ffi import Module +import tvm_ffi.cpp +from pathlib import Path +import re + + +def extract_code_from_file(file_path: Path) -> tuple[str, str]: + """Extract model name and generated code from a test result file. + + Returns: + tuple: (model_name, code) + """ + with open(file_path, 'r') as f: + content = f.read() + + # Extract model name + model_match = re.search(r'Model: (.+?)\n', content) + model_name = model_match.group(1) if model_match else file_path.stem + + # Extract code between "Generated Code:" and "Test Results:" + code_pattern = r'Generated Code:\n={80}\n(.*?)\n(?:Test Results:|$)' + code_match = re.search(code_pattern, content, re.DOTALL) + + if code_match: + code = code_match.group(1).strip() + else: + # Try alternative pattern - just get everything after "Generated Code:" + alt_pattern = r'Generated Code:\n={80}\n(.*)' + alt_match = re.search(alt_pattern, content, re.DOTALL) + if alt_match: + # Get everything until we see "Test Results:" or end of file + full_content = alt_match.group(1) + if 'Test Results:' in full_content: + code = full_content.split('Test Results:')[0].strip() + else: + code = full_content.strip() + else: + raise ValueError(f"Could not extract code from {file_path}") + + return model_name, code + + +def test_kernel(mod: Module, test_name: str) -> bool: + """Test the generated kernel with simple test cases.""" + try: + print(f"\n Running {test_name}...") + + if test_name == "test_small": + # Test 1: Small tensor + a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device='cuda') + b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device='cuda') + c = torch.empty_like(a) + + mod.elementwise_add(a, b, c) + expected = a + b + + torch.testing.assert_close(c, expected) + print(f" ✓ {test_name} passed") + return True + + elif test_name == "test_large": + # Test 2: Larger tensor + n = 10000 + a = torch.randn(n, dtype=torch.float32, device='cuda') + b = torch.randn(n, dtype=torch.float32, device='cuda') + c = torch.empty_like(a) + + mod.elementwise_add(a, b, c) + expected = a + b + + torch.testing.assert_close(c, expected) + print(f" ✓ {test_name} passed") + return True + + except Exception as e: + print(f" ✗ {test_name} failed: {e}") + return False + + +def update_test_results_in_file(file_path: Path, model_name: str, code: str, results: dict): + """Update the test results section in a result file, overwriting old results.""" + # Build the new content + content = f"Model: {model_name}\n" + content += "=" * 80 + "\n\n" + content += "Generated Code:\n" + content += "=" * 80 + "\n" + content += code + "\n\n" + content += "Test Results:\n" + content += "=" * 80 + "\n" + content += f"Compilation: {results['compilation']}\n" + + if "error" in results: + content += f"Error: {results['error']}\n" + else: + content += f"Test 1 (small tensor): {results['test_small']}\n" + content += f"Test 2 (large tensor): {results['test_large']}\n" + + # Write back to file + with open(file_path, 'w') as f: + f.write(content) + + +def test_saved_code(file_path: Path): + """Test code from a saved result file.""" + print(f"\n{'='*80}") + print(f"Testing: {file_path.name}") + print(f"{'='*80}") + + try: + # Extract code + model_name, cuda_code = extract_code_from_file(file_path) + print(f"Model: {model_name}") + print(f"Extracted {len(cuda_code)} characters of 
code") + + # Try to compile + print("Attempting to compile with tvm_ffi...") + try: + mod: Module = tvm_ffi.cpp.load_inline( + name=f'elementwise_add_{model_name.replace("-", "_")}', + cuda_sources=cuda_code + ) + print("✓ Compilation successful!") + + # Run tests + print("\nRunning tests...") + test1_passed = test_kernel(mod, "test_small") + test2_passed = test_kernel(mod, "test_large") + + all_passed = test1_passed and test2_passed + status = "✓ ALL TESTS PASSED" if all_passed else "✗ SOME TESTS FAILED" + print(f"\n{status}") + + # Update the file with new test results + update_test_results_in_file(file_path, model_name, cuda_code, { + "compilation": "SUCCESS", + "test_small": "PASS" if test1_passed else "FAIL", + "test_large": "PASS" if test2_passed else "FAIL" + }) + + return { + "file": file_path.name, + "model": model_name, + "compilation": "success", + "test_small": test1_passed, + "test_large": test2_passed, + "all_passed": all_passed + } + + except Exception as e: + print(f"✗ Compilation failed: {e}") + + # Update the file with compilation failure + update_test_results_in_file(file_path, model_name, cuda_code, { + "compilation": "FAILED", + "error": str(e) + }) + + return { + "file": file_path.name, + "model": model_name, + "compilation": "failed", + "error": str(e), + "all_passed": False + } + + except Exception as e: + print(f"✗ Error processing file: {e}") + return { + "file": file_path.name, + "compilation": "error", + "error": str(e), + "all_passed": False + } + + +def main(): + """Test all saved code from test_results directory.""" + + # Get test_results directory + test_results_dir = Path(__file__).parent / "test_results" + + if not test_results_dir.exists(): + print(f"Error: {test_results_dir} does not exist") + return + + # Find all .txt files + txt_files = sorted(test_results_dir.glob("*.txt")) + + if not txt_files: + print(f"No .txt files found in {test_results_dir}") + return + + print("="*80) + print("Re-testing Saved LLM Generated Code") + print("="*80) + print(f"Files to test: {len(txt_files)}") + print(f"Directory: {test_results_dir}") + + results = [] + + for idx, file_path in enumerate(txt_files, 1): + print(f"\n[{idx}/{len(txt_files)}] Testing {file_path.name}...") + result = test_saved_code(file_path) + results.append(result) + + # Print summary + print(f"\n{'='*80}") + print("FINAL SUMMARY") + print(f"{'='*80}") + + for result in results: + file_name = result['file'] + status = "✓ PASS" if result.get('all_passed', False) else "✗ FAIL" + print(f"{file_name:40} {status}") + if 'error' in result: + error_msg = result['error'] + # Truncate long errors + if len(error_msg) > 100: + error_msg = error_msg[:100] + "..." + print(f" Error: {error_msg}") + + total_passed = sum(1 for r in results if r.get('all_passed', False)) + print(f"\nTotal: {total_passed}/{len(results)} files passed all tests") + + +if __name__ == "__main__": + main() + diff --git a/flashinfer_bench/ffi/test_prompt.py b/flashinfer_bench/ffi/test_prompt.py new file mode 100644 index 0000000..fa87ba9 --- /dev/null +++ b/flashinfer_bench/ffi/test_prompt.py @@ -0,0 +1,283 @@ +""" +Test script to evaluate LLMs' ability to generate CUDA kernels with TVM FFI bindings. +Tests elementwise add kernel generation across multiple models. 
+""" + +import torch +from tvm_ffi import Module +import tvm_ffi.cpp +from pathlib import Path +import re +import os +from dotenv import load_dotenv + +from ffi_prompt import FFI_PROMPT_SIMPLE + +load_dotenv() + + +# System prompt for elementwise add +ELEMENTWISE_ADD_PROMPT = """Write a CUDA kernel function that performs elementwise addition of two tensors. + +The function should: +- Take three TensorView arguments: input tensor a, input tensor b, and output tensor c +- Compute c[i] = a[i] + b[i] for all elements +- Support 1D float32 tensors +- Use proper input validation +- Export the function with name "elementwise_add" +""" + + +def get_model_config(model_name: str): + """Get API configuration for a given model.""" + # OpenAI models + openai_key = os.getenv("OPENAI_API_KEY") + # Anthropic models + anthropic_key = os.getenv("ANTHROPIC_API_KEY") + + if model_name in ["gpt-5-2025-08-07", "o3", "gpt-5-mini-2025-08-07", "o4-mini-2025-04-16"]: + return { + "provider": "openai", + "api_key": openai_key, + "model": model_name + } + elif model_name in ["claude-opus-4-1-20250805", "claude-sonnet-4-5-20250805"]: + return { + "provider": "anthropic", + "api_key": anthropic_key, + "model": model_name + } + else: + raise ValueError(f"Unknown model: {model_name}") + + +def call_openai_model(model_name: str, api_key: str, prompt: str) -> str: + """Call OpenAI API to generate code.""" + import openai + + client = openai.OpenAI(api_key=api_key) + + if model_name in ["o3", "o4-mini-2025-04-16"]: + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "user", "content": prompt} + ], + reasoning_effort="high" + ) + else: + response = client.chat.completions.create( + model=model_name, + messages=[ + {"role": "system", "content": ELEMENTWISE_ADD_PROMPT}, + {"role": "user", "content": FFI_PROMPT_SIMPLE} + ] + ) + + return response.choices[0].message.content + + +def call_anthropic_model(model_name: str, api_key: str, prompt: str) -> str: + import anthropic + + client = anthropic.Anthropic(api_key=api_key) + + response = client.messages.create( + model=model_name, + max_tokens=4096, + system=ELEMENTWISE_ADD_PROMPT, + messages=[ + {"role": "user", "content": FFI_PROMPT_SIMPLE} + ] + ) + + return response.content[0].text + + +def extract_cuda_code(response: str) -> str: + patterns = [ + r"```(?:cpp|cuda|c\+\+)\n(.*?)```", + r"```\n(.*?)```", + ] + + for pattern in patterns: + matches = re.findall(pattern, response, re.DOTALL) + if matches: + for match in matches: + if "TVM_FFI" in match or "TensorView" in match: + return match.strip() + + return response.strip() + + +def test_kernel(mod: Module, test_name: str) -> bool: + """Test the generated kernel with simple test cases.""" + try: + print(f"\n Running {test_name}...") + + if test_name == "test_small": + # Test 1: Small tensor + a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device='cuda') + b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device='cuda') + c = torch.empty_like(a) + + mod.elementwise_add(a, b, c) + expected = a + b + + torch.testing.assert_close(c, expected) + print(f" ✓ {test_name} passed") + return True + + elif test_name == "test_large": + # Test 2: Larger tensor + n = 10000 + a = torch.randn(n, dtype=torch.float32, device='cuda') + b = torch.randn(n, dtype=torch.float32, device='cuda') + c = torch.empty_like(a) + + mod.elementwise_add(a, b, c) + expected = a + b + + torch.testing.assert_close(c, expected) + print(f" ✓ {test_name} passed") + return True + + except 
Exception as e: + print(f" ✗ {test_name} failed: {e}") + return False + + +def test_model(model_name: str, output_dir: Path): + print(f"\n{'='*80}") + print(f"Testing model: {model_name}") + print(f"{'='*80}") + + try: + config = get_model_config(model_name) + + full_prompt = ELEMENTWISE_ADD_PROMPT + "\n\n" + FFI_PROMPT_SIMPLE + + print(f"Calling {config['provider']} API...") + if config["provider"] == "openai": + response = call_openai_model(config["model"], config["api_key"], full_prompt) + elif config["provider"] == "anthropic": + response = call_anthropic_model(config["model"], config["api_key"], full_prompt) + else: + raise ValueError(f"Unknown provider: {config['provider']}") + + print(f"Received response from {model_name}") + + cuda_code = extract_cuda_code(response) + print(f"Extracted {len(cuda_code)} characters of code") + + output_file = output_dir / f"{model_name}.txt" + with open(output_file, 'w') as f: + f.write(f"Model: {model_name}\n") + f.write(f"{'='*80}\n\n") + f.write("Generated Code:\n") + f.write("="*80 + "\n") + f.write(cuda_code) + f.write("\n\n") + + print(f"Saved response to {output_file}") + + try: + mod: Module = tvm_ffi.cpp.load_inline( + name=f'elementwise_add_{model_name.replace("-", "_")}', + cuda_sources=cuda_code + ) + print("Compilation successful!") + + # Run tests + print("\nRunning tests...") + test1_passed = test_kernel(mod, "test_small") + test2_passed = test_kernel(mod, "test_large") + + with open(output_file, 'a') as f: + f.write("Test Results:\n") + f.write("="*80 + "\n") + f.write(f"Compilation: SUCCESS\n") + f.write(f"Test 1 (small tensor): {'PASS' if test1_passed else 'FAIL'}\n") + f.write(f"Test 2 (large tensor): {'PASS' if test2_passed else 'FAIL'}\n") + + all_passed = test1_passed and test2_passed + status = "ALL TESTS PASSED" if all_passed else "TESTS FAILED" + print(f"\n{status}") + + return { + "model": model_name, + "compilation": "success", + "test_small": test1_passed, + "test_large": test2_passed, + "all_passed": all_passed + } + + except Exception as e: + print(f"Compilation failed: {e}") + with open(output_file, 'a') as f: + f.write("Test Results:\n") + f.write("="*80 + "\n") + f.write(f"Compilation: FAILED\n") + f.write(f"Error: {str(e)}\n") + + return { + "model": model_name, + "compilation": "failed", + "error": str(e), + "all_passed": False + } + + except Exception as e: + print(f"Error testing {model_name}: {e}") + return { + "model": model_name, + "compilation": "error", + "error": str(e), + "all_passed": False + } + + +def main(): + models = [ + "gpt-5-2025-08-07", + "o3", + "claude-opus-4-1-20250805", + "claude-sonnet-4-5-20250805", + "gpt-5-mini-2025-08-07", + "o4-mini-2025-04-16" + ] + + output_dir = Path(__file__).parent / "test_results" + output_dir.mkdir(exist_ok=True) + + print("="*80) + print("Testing LLM FFI Code Generation") + print("="*80) + print(f"Models to test: {len(models)}") + print(f"Output directory: {output_dir}") + + results = [] + + for idx, model_name in enumerate(models, 1): + print(f"\n[{idx}/{len(models)}] Testing {model_name}...") + result = test_model(model_name, output_dir) + results.append(result) + + print(f"\n{'='*80}") + print("FINAL SUMMARY") + print(f"{'='*80}") + + for result in results: + model = result['model'] + status = "✓ PASS" if result.get('all_passed', False) else "✗ FAIL" + print(f"{model:30} {status}") + if 'error' in result: + print(f" Error: {result['error'][:100]}") + + total_passed = sum(1 for r in results if r.get('all_passed', False)) + print(f"\nTotal: 
{total_passed}/{len(models)} models passed all tests") + + +if __name__ == "__main__": + main() + diff --git a/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt b/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt new file mode 100644 index 0000000..83dd087 --- /dev/null +++ b/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt @@ -0,0 +1,124 @@ +Model: claude-opus-4-1-20250805 +================================================================================ + +Generated Code: +================================================================================ +// File: elementwise_add.cu +#include +#include +#include +#include + +namespace elementwise_ops { + +__global__ void ElementwiseAddKernel(const float* a, const float* b, float* c, int64_t n) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { + // Validate that all tensors are on the same device + DLDevice dev_a = a.device(); + DLDevice dev_b = b.device(); + DLDevice dev_c = c.device(); + + if (dev_a.device_type != dev_b.device_type || dev_a.device_id != dev_b.device_id || + dev_a.device_type != dev_c.device_type || dev_a.device_id != dev_c.device_id) { + TVM_FFI_THROW(ValueError) << "All tensors must be on the same device"; + } + + // Check that we're on CUDA device + if (dev_a.device_type != kDLCUDA) { + TVM_FFI_THROW(RuntimeError) << "This kernel only supports CUDA devices, got device type: " + << dev_a.device_type; + } + + // Validate tensor dimensions - must be 1D + if (a.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be 1D, got " << a.ndim() << "D"; + } + if (b.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "Input tensor 'b' must be 1D, got " << b.ndim() << "D"; + } + if (c.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be 1D, got " << c.ndim() << "D"; + } + + // Validate tensor shapes match + int64_t n = a.size(0); + if (b.size(0) != n) { + TVM_FFI_THROW(ValueError) << "Input tensors must have the same size. " + << "a.size=" << n << ", b.size=" << b.size(0); + } + if (c.size(0) != n) { + TVM_FFI_THROW(ValueError) << "Output tensor size must match input size. 
" + << "Expected " << n << ", got " << c.size(0); + } + + // Validate data types - must be float32 + DLDataType dtype_a = a.dtype(); + DLDataType dtype_b = b.dtype(); + DLDataType dtype_c = c.dtype(); + + if (dtype_a.code != kDLFloat || dtype_a.bits != 32 || dtype_a.lanes != 1) { + TVM_FFI_THROW(TypeError) << "Input tensor 'a' must be float32"; + } + if (dtype_b.code != kDLFloat || dtype_b.bits != 32 || dtype_b.lanes != 1) { + TVM_FFI_THROW(TypeError) << "Input tensor 'b' must be float32"; + } + if (dtype_c.code != kDLFloat || dtype_c.bits != 32 || dtype_c.lanes != 1) { + TVM_FFI_THROW(TypeError) << "Output tensor 'c' must be float32"; + } + + // Validate tensors are contiguous + if (!a.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be contiguous"; + } + if (!b.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "Input tensor 'b' must be contiguous"; + } + if (!c.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be contiguous"; + } + + // Get data pointers + const float* a_data = static_cast(a.data_ptr()); + const float* b_data = static_cast(b.data_ptr()); + float* c_data = static_cast(c.data_ptr()); + + // Handle empty tensor case + if (n == 0) { + return; + } + + // Get CUDA stream from environment + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev_a.device_type, dev_a.device_id)); + + // Configure kernel launch parameters + const int64_t threads_per_block = 256; + int64_t num_blocks = (n + threads_per_block - 1) / threads_per_block; + + // Launch kernel + ElementwiseAddKernel<<>>( + a_data, b_data, c_data, n); + + // Check for kernel launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + TVM_FFI_THROW(RuntimeError) << "CUDA kernel launch failed: " << cudaGetErrorString(err); + } +} + +// Export the function with name "elementwise_add" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); + +} // namespace elementwise_ops + +Test Results: +================================================================================ +Compilation: SUCCESS +Test 1 (small tensor): PASS +Test 2 (large tensor): PASS diff --git a/flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt b/flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt new file mode 100644 index 0000000..39a6d59 --- /dev/null +++ b/flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt @@ -0,0 +1,111 @@ +Model: gpt-5-2025-08-07 +================================================================================ + +Generated Code: +================================================================================ +// File: elementwise_add.cu +#include + +#include // TensorView: tensor arguments +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // TVM_FFI_ICHECK, TVM_FFI_THROW +#include // TVMFFIEnvGetStream + +namespace elementwise { + +__global__ void ElementwiseAddKernel(const float* __restrict__ a, + const float* __restrict__ b, + float* __restrict__ c, + int64_t n) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +void ElementwiseAdd(tvm::ffi::TensorView a, + tvm::ffi::TensorView b, + tvm::ffi::TensorView c) { + // User input validation + if (a.ndim() != 1 || b.ndim() != 1 || c.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "All tensors must be 1D, got ndim: a=" << a.ndim() + << ", b=" << b.ndim() << ", c=" << c.ndim(); + } + + // DType checks: float32 + auto da = a.dtype(); + auto db = b.dtype(); + auto dc = c.dtype(); + if (!(da.code == kDLFloat && da.bits == 32 && 
da.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "Tensor a must be float32"; + } + if (!(db.code == kDLFloat && db.bits == 32 && db.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "Tensor b must be float32"; + } + if (!(dc.code == kDLFloat && dc.bits == 32 && dc.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "Tensor c must be float32"; + } + + // Shape checks + int64_t na = a.size(0); + int64_t nb = b.size(0); + int64_t nc = c.size(0); + if (!(na == nb && nb == nc)) { + TVM_FFI_THROW(ValueError) << "Shape mismatch: a.size(0)=" << na + << ", b.size(0)=" << nb << ", c.size(0)=" << nc; + } + + // Device checks: must be CUDA and all on same device + DLDevice da_dev = a.device(); + DLDevice db_dev = b.device(); + DLDevice dc_dev = c.device(); + if (!(da_dev.device_type == kDLCUDA && db_dev.device_type == kDLCUDA && + dc_dev.device_type == kDLCUDA)) { + TVM_FFI_THROW(RuntimeError) << "elementwise_add supports only CUDA tensors"; + } + if (!(da_dev.device_id == db_dev.device_id && db_dev.device_id == dc_dev.device_id)) { + TVM_FFI_THROW(ValueError) << "All tensors must be on the same CUDA device"; + } + + // Memory layout: require contiguous for simple linear indexing + if (!a.IsContiguous() || !b.IsContiguous() || !c.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "All tensors must be contiguous"; + } + + // Get data pointers + const float* a_ptr = static_cast(a.data_ptr()); + const float* b_ptr = static_cast(b.data_ptr()); + float* c_ptr = static_cast(c.data_ptr()); + + int64_t n = na; + if (n == 0) return; + + // Get CUDA stream from TVM environment + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dc_dev.device_type, dc_dev.device_id)); + + // Launch configuration + const int threads = 256; + unsigned int blocks = static_cast((n + threads - 1) / threads); + if (blocks == 0) return; + + // Launch kernel + ElementwiseAddKernel<<>>(a_ptr, b_ptr, c_ptr, n); + + // Check for launch errors + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + TVM_FFI_THROW(RuntimeError) << "CUDA kernel launch failed: " << cudaGetErrorString(err); + } +} + +} // namespace elementwise + +// Export the function with name "elementwise_add" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, elementwise::ElementwiseAdd) + +Test Results: +================================================================================ +Compilation: SUCCESS +Test 1 (small tensor): PASS +Test 2 (large tensor): PASS diff --git a/flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt b/flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt new file mode 100644 index 0000000..24dd09b --- /dev/null +++ b/flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt @@ -0,0 +1,129 @@ +Model: gpt-5-mini-2025-08-07 +================================================================================ + +Generated Code: +================================================================================ +// File: elementwise_add.cu +#include +#include +#include + +#include // TensorView: tensor arguments +#include // TVMFFIEnvGetStream, TVMFFIEnvTensorAlloc +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // TVM_FFI_ICHECK, TVM_FFI_THROW + +using tvm::ffi::TensorView; + +namespace tvm_extra_kernels { + +__global__ void ElementwiseAddKernel(const float* a, const float* b, float* c, int64_t n) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +void ElementwiseAdd(TensorView a, TensorView b, TensorView c) { + // Basic shape / type validation (user-facing 
errors) + if (a.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "elementwise_add: input tensor 'a' must be 1D, got " + << a.ndim() << "D"; + } + if (b.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "elementwise_add: input tensor 'b' must be 1D, got " + << b.ndim() << "D"; + } + if (c.ndim() != 1) { + TVM_FFI_THROW(ValueError) << "elementwise_add: output tensor 'c' must be 1D, got " + << c.ndim() << "D"; + } + + // dtype validation: float32 + auto da = a.dtype(); + auto db = b.dtype(); + auto dc = c.dtype(); + if (!(da.code == kDLFloat && da.bits == 32 && da.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'a' must be float32"; + } + if (!(db.code == kDLFloat && db.bits == 32 && db.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'b' must be float32"; + } + if (!(dc.code == kDLFloat && dc.bits == 32 && dc.lanes == 1)) { + TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'c' must be float32"; + } + + // shape / size validation + int64_t n = a.size(0); + if (b.size(0) != n) { + TVM_FFI_THROW(ValueError) << "elementwise_add: size mismatch between 'a' and 'b' (" + << n << " vs " << b.size(0) << ")"; + } + if (c.size(0) != n) { + TVM_FFI_THROW(ValueError) << "elementwise_add: output 'c' size must match inputs (expected " + << n << ", got " << c.size(0) << ")"; + } + + // Ensure tensors are contiguous for simple indexing + if (!a.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "elementwise_add: input 'a' must be contiguous"; + } + if (!b.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "elementwise_add: input 'b' must be contiguous"; + } + if (!c.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "elementwise_add: output 'c' must be contiguous"; + } + + // Ensure all tensors are on the same device + DLDevice da_dev = a.device(); + DLDevice db_dev = b.device(); + DLDevice dc_dev = c.device(); + if (!(da_dev.device_type == db_dev.device_type && da_dev.device_id == db_dev.device_id && + da_dev.device_type == dc_dev.device_type && da_dev.device_id == dc_dev.device_id)) { + TVM_FFI_THROW(ValueError) << "elementwise_add: tensors must be on the same device"; + } + + // Pointers to data + float* a_ptr = static_cast(a.data_ptr()); + float* b_ptr = static_cast(b.data_ptr()); + float* c_ptr = static_cast(c.data_ptr()); + + // Dispatch by device type + if (da_dev.device_type == kDLCUDA) { + // GPU path: launch CUDA kernel on environment-managed stream + TVMFFIStreamHandle raw_stream = TVMFFIEnvGetStream(da_dev.device_type, da_dev.device_id); + cudaStream_t stream = static_cast(raw_stream); + + const int threads = 256; + const int64_t blocks = (n + threads - 1) / threads; + ElementwiseAddKernel<<(blocks), threads, 0, stream>>>(a_ptr, b_ptr, c_ptr, n); + + // Check kernel launch error + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + TVM_FFI_THROW(RuntimeError) << "elementwise_add: CUDA kernel launch failed: " + << cudaGetErrorString(err); + } + // Note: we do not synchronize here; environment may manage synchronization. 
+ } else if (da_dev.device_type == kDLCPU || da_dev.device_type == kDLCUDAHost) { + // CPU path: simple element-wise loop + for (int64_t i = 0; i < n; ++i) { + c_ptr[i] = a_ptr[i] + b_ptr[i]; + } + } else { + TVM_FFI_THROW(RuntimeError) << "elementwise_add: unsupported device type " + << da_dev.device_type; + } +} + +// Export the function with name "elementwise_add" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); + +} // namespace tvm_extra_kernels + +Test Results: +================================================================================ +Compilation: SUCCESS +Test 1 (small tensor): PASS +Test 2 (large tensor): PASS diff --git a/flashinfer_bench/ffi/test_results/o3.txt b/flashinfer_bench/ffi/test_results/o3.txt new file mode 100644 index 0000000..212fd5a --- /dev/null +++ b/flashinfer_bench/ffi/test_results/o3.txt @@ -0,0 +1,141 @@ +Model: o3 +================================================================================ + +Generated Code: +================================================================================ +// File: elementwise_add.cu +// +// Simple element-wise addition kernel exported through TVM-FFI +// c[i] = a[i] + b[i] +// +// Build example (CUDA >= 11): +// nvcc -std=c++17 -O2 --shared -Xcompiler -fPIC elementwise_add.cu -o libelementwise_add.so +// +// ------------------------------------------------------------------------------ + +#include + +#include // TensorView +#include // TVMFFIEnvGetStream +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // ICHECK / THROW helpers + +namespace tvm_add { + +/*! + * \brief CUDA kernel: c[i] = a[i] + b[i] + */ +__global__ void ElementwiseAddKernel(const float* a, + const float* b, + float* c, + int64_t n) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +/*! + * \brief Host wrapper visible to TVM-FFI. + * + * Signature: void(TensorView a, TensorView b, TensorView c) + */ +void ElementwiseAdd(tvm::ffi::TensorView a, + tvm::ffi::TensorView b, + tvm::ffi::TensorView c) { + /* ---------------------- Input validation ---------------------- */ + // 1-D check + if (a.ndim() != 1 || b.ndim() != 1 || c.ndim() != 1) { + TVM_FFI_THROW(ValueError) + << "All tensors must be 1-D. Got ndim = " + << a.ndim() << ", " << b.ndim() << ", " << c.ndim(); + } + + // dtype check (float32 only) + auto check_f32 = [](const tvm::ffi::TensorView& t, const char* name) { + DLDataType dt = t.dtype(); + if (dt.code != kDLFloat || dt.bits != 32 || dt.lanes != 1) { + TVM_FFI_THROW(TypeError) + << "Tensor " << name << " must be float32. " + << "Got (code=" << static_cast(dt.code) + << ", bits=" << dt.bits + << ", lanes=" << dt.lanes << ")"; + } + }; + check_f32(a, "a"); + check_f32(b, "b"); + check_f32(c, "c"); + + // size / shape check + int64_t n = a.size(0); + if (b.size(0) != n || c.size(0) != n) { + TVM_FFI_THROW(ValueError) + << "Mismatched tensor sizes: " + << "a=" << a.size(0) << ", " + << "b=" << b.size(0) << ", " + << "c=" << c.size(0); + } + + // device check (all on same CUDA device) + DLDevice dev_a = a.device(); + DLDevice dev_b = b.device(); + DLDevice dev_c = c.device(); + if (dev_a.device_type != kDLCUDA || dev_b.device_type != kDLCUDA || + dev_c.device_type != kDLCUDA) { + TVM_FFI_THROW(RuntimeError) + << "All tensors must reside on CUDA device. 
" + << "Device types: a=" << dev_a.device_type + << ", b=" << dev_b.device_type + << ", c=" << dev_c.device_type; + } + if (dev_a.device_id != dev_b.device_id || + dev_a.device_id != dev_c.device_id) { + TVM_FFI_THROW(RuntimeError) + << "All tensors must be on the same CUDA device. " + << "Device IDs: a=" << dev_a.device_id + << ", b=" << dev_b.device_id + << ", c=" << dev_c.device_id; + } + + // Optional: require contiguous memory for simplest access pattern + if (!a.IsContiguous() || !b.IsContiguous() || !c.IsContiguous()) { + TVM_FFI_THROW(ValueError) << "All tensors must be contiguous."; + } + + /* -------------------- Launch CUDA kernel ---------------------- */ + if (n == 0) return; // nothing to do + + const float* a_ptr = static_cast(a.data_ptr()); + const float* b_ptr = static_cast(b.data_ptr()); + float* c_ptr = static_cast(c.data_ptr()); + + const int threads = 256; + const int blocks = static_cast((n + threads - 1) / threads); + + // Get stream managed by runtime + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev_a.device_type, dev_a.device_id)); + + ElementwiseAddKernel<<>>(a_ptr, b_ptr, c_ptr, n); + +#ifndef NDEBUG + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) { + TVM_FFI_THROW(RuntimeError) + << "CUDA kernel launch failed: " << cudaGetErrorString(err); + } +#endif +} + +} // namespace tvm_add + +// ----------------------------------------------------------------------------- +// TVM-FFI export +// ----------------------------------------------------------------------------- +TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, tvm_add::ElementwiseAdd); + +Test Results: +================================================================================ +Compilation: SUCCESS +Test 1 (small tensor): PASS +Test 2 (large tensor): PASS diff --git a/flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt b/flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt new file mode 100644 index 0000000..0f67d4b --- /dev/null +++ b/flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt @@ -0,0 +1,122 @@ +Model: o4-mini-2025-04-16 +================================================================================ + +Generated Code: +================================================================================ +// File: elementwise_add.cu + +#include // TensorView +#include // TVMFFIEnvGetStream +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // TVM_FFI_ICHECK, TVM_FFI_THROW +#include // cudaStream_t + +// CUDA kernel: c[i] = a[i] + b[i] +__global__ void ElementwiseAddKernel(const float* a, + const float* b, + float* c, + int64_t n) { + int64_t idx = static_cast(blockIdx.x) * blockDim.x + + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +// Host wrapper exposed via TVM FFI +void ElementwiseAdd(tvm::ffi::TensorView a, + tvm::ffi::TensorView b, + tvm::ffi::TensorView c) { + // 1) Dimensionality checks + if (a.ndim() != 1) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: expected 1D tensor for `a`, got " + << a.ndim() << "D"; + } + if (b.ndim() != 1) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: expected 1D tensor for `b`, got " + << b.ndim() << "D"; + } + if (c.ndim() != 1) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: expected 1D tensor for `c`, got " + << c.ndim() << "D"; + } + + int64_t n = a.size(0); + if (b.size(0) != n) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: size mismatch — `a` has length " + << n << " but `b` has length " << b.size(0); + } + if (c.size(0) != n) { + TVM_FFI_THROW(ValueError) + << 
"elementwise_add: size mismatch — `a` has length " + << n << " but `c` has length " << c.size(0); + } + + // 2) DType checks (float32) + auto da = a.dtype(); + if (da.code != kDLFloat || da.bits != 32 || da.lanes != 1) { + TVM_FFI_THROW(TypeError) + << "elementwise_add: expected float32 dtype for `a`"; + } + auto db = b.dtype(); + if (db.code != kDLFloat || db.bits != 32 || db.lanes != 1) { + TVM_FFI_THROW(TypeError) + << "elementwise_add: expected float32 dtype for `b`"; + } + auto dc = c.dtype(); + if (dc.code != kDLFloat || dc.bits != 32 || dc.lanes != 1) { + TVM_FFI_THROW(TypeError) + << "elementwise_add: expected float32 dtype for `c`"; + } + + // 3) (Optional) Contiguity checks + TVM_FFI_ICHECK(a.IsContiguous()) << "elementwise_add: `a` must be contiguous"; + TVM_FFI_ICHECK(b.IsContiguous()) << "elementwise_add: `b` must be contiguous"; + TVM_FFI_ICHECK(c.IsContiguous()) << "elementwise_add: `c` must be contiguous"; + + // 4) Device checks + DLDevice dev = a.device(); + if (dev.device_type != kDLCUDA) { + TVM_FFI_THROW(RuntimeError) + << "elementwise_add: only CUDA device supported"; + } + if (b.device().device_type != dev.device_type || + b.device().device_id != dev.device_id) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: `b` must be on the same CUDA device as `a`"; + } + if (c.device().device_type != dev.device_type || + c.device().device_id != dev.device_id) { + TVM_FFI_THROW(ValueError) + << "elementwise_add: `c` must be on the same CUDA device as `a`"; + } + + // 5) Pointers + const float* a_ptr = static_cast(a.data_ptr()); + const float* b_ptr = static_cast(b.data_ptr()); + float* c_ptr = static_cast(c.data_ptr()); + + // 6) Get CUDA stream from TVM runtime + cudaStream_t stream = + static_cast(TVMFFIEnvGetStream(dev.device_type, + dev.device_id)); + + // 7) Launch kernel + constexpr int THREADS = 256; + int blocks = static_cast((n + THREADS - 1) / THREADS); + ElementwiseAddKernel<<>>( + a_ptr, b_ptr, c_ptr, n); +} + +// Export for TVM FFI under the name "elementwise_add" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); + +Test Results: +================================================================================ +Compilation: SUCCESS +Test 1 (small tensor): PASS +Test 2 (large tensor): PASS From 25092adc6d4fae29e28fe68d46e5a7e4a42cfc1d Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Mon, 10 Nov 2025 13:58:28 -0500 Subject: [PATCH 02/10] formatting --- .gitignore | 2 +- flashinfer_bench/ffi/ffi_prompt.py | 32 ++-- flashinfer_bench/ffi/load_inline.py | 16 +- .../ffi/test_existing_prompt_sol.py | 143 +++++++-------- flashinfer_bench/ffi/test_prompt.py | 168 ++++++++---------- .../test_results/claude-opus-4-1-20250805.txt | 28 +-- 6 files changed, 185 insertions(+), 204 deletions(-) diff --git a/.gitignore b/.gitignore index 35ca3cb..6a343dc 100644 --- a/.gitignore +++ b/.gitignore @@ -48,4 +48,4 @@ web/**/public tmp/ AGENTS.md -.env \ No newline at end of file +.env diff --git a/flashinfer_bench/ffi/ffi_prompt.py b/flashinfer_bench/ffi/ffi_prompt.py index 0a9db3e..59ee283 100644 --- a/flashinfer_bench/ffi/ffi_prompt.py +++ b/flashinfer_bench/ffi/ffi_prompt.py @@ -150,7 +150,7 @@ int64_t height = x.size(0); int64_t width = x.size(1); float* data = static_cast(x.data_ptr()); - + // If contiguous (row-major), access as: data[i * width + j] TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor"; } @@ -192,17 +192,17 @@ TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; TVM_FFI_ICHECK_EQ(x.size(0), 
y.size(0)) << "Shape mismatch"; - + // Get data pointers float* x_data = static_cast(x.data_ptr()); float* y_data = static_cast(y.data_ptr()); int64_t n = x.size(0); - + // Get CUDA stream from environment DLDevice dev = x.device(); cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - + // Launch kernel int64_t threads = 256; int64_t blocks = (n + threads - 1) / threads; @@ -251,17 +251,17 @@ TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch"; - + // Get data pointers float* x_data = static_cast(x.data_ptr()); float* y_data = static_cast(y.data_ptr()); int64_t n = x.size(0); - + // Get CUDA stream from environment DLDevice dev = x.device(); cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - + // Launch kernel int64_t threads = 256; int64_t blocks = (n + threads - 1) / threads; @@ -284,7 +284,7 @@ **Methods:** - `void* data_ptr() const` - Returns raw data pointer - `DLDevice device() const` - Returns device info (`.device_type`, `.device_id`) -- `int32_t ndim() const` - Returns number of dimensions +- `int32_t ndim() const` - Returns number of dimensions - `DLDataType dtype() const` - Returns data type (`.code`, `.bits`, `.lanes`) - `ShapeView shape() const` - Returns shape array (indexable: `shape()[0]`, `shape()[1]`, ...) - `ShapeView strides() const` - Returns strides array @@ -306,7 +306,7 @@ - `DLManagedTensorVersioned* ToDLPackVersioned() const` - Convert to versioned DLPack - `const DLTensor* GetDLTensorPtr() const` - Get underlying DLTensor pointer -### TensorView Class +### TensorView Class Non-owning lightweight view of a Tensor. Kernel entrypoints should use `tvm::ffi::TensorView` (or `const TensorView&`) for tensor inputs/outputs. 
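As a quick, hedged illustration (the function and export names below are placeholders, and validation/launch details are omitted), an entrypoint in this style looks like:

```cpp
// Sketch only: TensorView (or const TensorView&) parameters, exported under a placeholder name.
void CopyLike(const tvm::ffi::TensorView& src, tvm::ffi::TensorView dst) {
  TVM_FFI_ICHECK_EQ(src.numel(), dst.numel()) << "Element count mismatch";
  // ... dispatch on src.device() and launch the actual copy kernel here ...
}
TVM_FFI_DLL_EXPORT_TYPED_FUNC(copy_like, CopyLike);
```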
**Constructors:** @@ -509,7 +509,7 @@ ### Stream Management ```cpp // Set current stream for a device -int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, +int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, TVMFFIStreamHandle stream, TVMFFIStreamHandle* opt_out_original_stream) @@ -522,7 +522,7 @@ TVMFFIEnvGetStream(dev.device_type, dev.device_id)); ``` -### Tensor Allocation +### Tensor Allocation ```cpp // Set DLPack allocator (TLS or global) int TVMFFIEnvSetDLPackManagedTensorAllocator( @@ -577,7 +577,7 @@ DLDataType dt = x.dtype(); TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype"; TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 32-bit dtype"; - + // Or for user-facing errors: if (dt.code != kDLFloat || dt.bits != 32) { TVM_FFI_THROW(TypeError) << "Expected float32, got " @@ -603,10 +603,10 @@ int64_t height = x.size(0); int64_t width = x.size(1); float* data = static_cast(x.data_ptr()); - + // If contiguous (row-major), access as: data[i * width + j] TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor"; - + // With strides: int64_t stride0 = x.stride(0); int64_t stride1 = x.stride(1); @@ -638,10 +638,10 @@ ffi::ShapeView shape = input.shape(); DLDataType dtype = input.dtype(); DLDevice device = input.device(); - + // Use environment allocator ffi::Tensor output = ffi::Tensor::FromEnvAlloc( TVMFFIEnvTensorAlloc, shape, dtype, device); } ``` -""" \ No newline at end of file +""" diff --git a/flashinfer_bench/ffi/load_inline.py b/flashinfer_bench/ffi/load_inline.py index 8a70492..fc1811c 100644 --- a/flashinfer_bench/ffi/load_inline.py +++ b/flashinfer_bench/ffi/load_inline.py @@ -1,8 +1,9 @@ +import re +from pathlib import Path + import torch -from tvm_ffi import Module import tvm_ffi.cpp -from pathlib import Path -import re +from tvm_ffi import Module cuda_source = """ // File: compile/add_one_cuda.cu @@ -34,12 +35,11 @@ } // namespace tvm_ffi_example_cuda """ + def main(): - mod: Module = tvm_ffi.cpp.load_inline( - name='add_one_cuda', - cuda_sources=cuda_source, - ) + mod: Module = tvm_ffi.cpp.load_inline(name="add_one_cuda", cuda_sources=cuda_source) print("Compilation successful") + if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/flashinfer_bench/ffi/test_existing_prompt_sol.py b/flashinfer_bench/ffi/test_existing_prompt_sol.py index ef13b94..d597f2f 100644 --- a/flashinfer_bench/ffi/test_existing_prompt_sol.py +++ b/flashinfer_bench/ffi/test_existing_prompt_sol.py @@ -3,46 +3,47 @@ Reads .txt files containing generated code and runs the same test cases. """ +import re +from pathlib import Path + import torch -from tvm_ffi import Module import tvm_ffi.cpp -from pathlib import Path -import re +from tvm_ffi import Module def extract_code_from_file(file_path: Path) -> tuple[str, str]: """Extract model name and generated code from a test result file. 
- + Returns: tuple: (model_name, code) """ - with open(file_path, 'r') as f: + with open(file_path, "r") as f: content = f.read() - + # Extract model name - model_match = re.search(r'Model: (.+?)\n', content) + model_match = re.search(r"Model: (.+?)\n", content) model_name = model_match.group(1) if model_match else file_path.stem - + # Extract code between "Generated Code:" and "Test Results:" - code_pattern = r'Generated Code:\n={80}\n(.*?)\n(?:Test Results:|$)' + code_pattern = r"Generated Code:\n={80}\n(.*?)\n(?:Test Results:|$)" code_match = re.search(code_pattern, content, re.DOTALL) - + if code_match: code = code_match.group(1).strip() else: # Try alternative pattern - just get everything after "Generated Code:" - alt_pattern = r'Generated Code:\n={80}\n(.*)' + alt_pattern = r"Generated Code:\n={80}\n(.*)" alt_match = re.search(alt_pattern, content, re.DOTALL) if alt_match: # Get everything until we see "Test Results:" or end of file full_content = alt_match.group(1) - if 'Test Results:' in full_content: - code = full_content.split('Test Results:')[0].strip() + if "Test Results:" in full_content: + code = full_content.split("Test Results:")[0].strip() else: code = full_content.strip() else: raise ValueError(f"Could not extract code from {file_path}") - + return model_name, code @@ -50,34 +51,34 @@ def test_kernel(mod: Module, test_name: str) -> bool: """Test the generated kernel with simple test cases.""" try: print(f"\n Running {test_name}...") - + if test_name == "test_small": # Test 1: Small tensor - a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device='cuda') - b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device='cuda') + a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device="cuda") + b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device="cuda") c = torch.empty_like(a) - + mod.elementwise_add(a, b, c) expected = a + b - + torch.testing.assert_close(c, expected) print(f" ✓ {test_name} passed") return True - + elif test_name == "test_large": # Test 2: Larger tensor n = 10000 - a = torch.randn(n, dtype=torch.float32, device='cuda') - b = torch.randn(n, dtype=torch.float32, device='cuda') + a = torch.randn(n, dtype=torch.float32, device="cuda") + b = torch.randn(n, dtype=torch.float32, device="cuda") c = torch.empty_like(a) - + mod.elementwise_add(a, b, c) expected = a + b - + torch.testing.assert_close(c, expected) print(f" ✓ {test_name} passed") return True - + except Exception as e: print(f" ✗ {test_name} failed: {e}") return False @@ -94,15 +95,15 @@ def update_test_results_in_file(file_path: Path, model_name: str, code: str, res content += "Test Results:\n" content += "=" * 80 + "\n" content += f"Compilation: {results['compilation']}\n" - + if "error" in results: content += f"Error: {results['error']}\n" else: content += f"Test 1 (small tensor): {results['test_small']}\n" content += f"Test 2 (large tensor): {results['test_large']}\n" - + # Write back to file - with open(file_path, 'w') as f: + with open(file_path, "w") as f: f.write(content) @@ -111,124 +112,126 @@ def test_saved_code(file_path: Path): print(f"\n{'='*80}") print(f"Testing: {file_path.name}") print(f"{'='*80}") - + try: # Extract code model_name, cuda_code = extract_code_from_file(file_path) print(f"Model: {model_name}") print(f"Extracted {len(cuda_code)} characters of code") - + # Try to compile print("Attempting to compile with tvm_ffi...") try: mod: Module = tvm_ffi.cpp.load_inline( - 
name=f'elementwise_add_{model_name.replace("-", "_")}', - cuda_sources=cuda_code + name=f'elementwise_add_{model_name.replace("-", "_")}', cuda_sources=cuda_code ) print("✓ Compilation successful!") - + # Run tests print("\nRunning tests...") test1_passed = test_kernel(mod, "test_small") test2_passed = test_kernel(mod, "test_large") - + all_passed = test1_passed and test2_passed status = "✓ ALL TESTS PASSED" if all_passed else "✗ SOME TESTS FAILED" print(f"\n{status}") - + # Update the file with new test results - update_test_results_in_file(file_path, model_name, cuda_code, { - "compilation": "SUCCESS", - "test_small": "PASS" if test1_passed else "FAIL", - "test_large": "PASS" if test2_passed else "FAIL" - }) - + update_test_results_in_file( + file_path, + model_name, + cuda_code, + { + "compilation": "SUCCESS", + "test_small": "PASS" if test1_passed else "FAIL", + "test_large": "PASS" if test2_passed else "FAIL", + }, + ) + return { "file": file_path.name, "model": model_name, "compilation": "success", "test_small": test1_passed, "test_large": test2_passed, - "all_passed": all_passed + "all_passed": all_passed, } - + except Exception as e: print(f"✗ Compilation failed: {e}") - + # Update the file with compilation failure - update_test_results_in_file(file_path, model_name, cuda_code, { - "compilation": "FAILED", - "error": str(e) - }) - + update_test_results_in_file( + file_path, model_name, cuda_code, {"compilation": "FAILED", "error": str(e)} + ) + return { "file": file_path.name, "model": model_name, "compilation": "failed", "error": str(e), - "all_passed": False + "all_passed": False, } - + except Exception as e: print(f"✗ Error processing file: {e}") return { "file": file_path.name, "compilation": "error", "error": str(e), - "all_passed": False + "all_passed": False, } def main(): """Test all saved code from test_results directory.""" - + # Get test_results directory test_results_dir = Path(__file__).parent / "test_results" - + if not test_results_dir.exists(): print(f"Error: {test_results_dir} does not exist") return - + # Find all .txt files txt_files = sorted(test_results_dir.glob("*.txt")) - + if not txt_files: print(f"No .txt files found in {test_results_dir}") return - - print("="*80) + + print("=" * 80) print("Re-testing Saved LLM Generated Code") - print("="*80) + print("=" * 80) print(f"Files to test: {len(txt_files)}") print(f"Directory: {test_results_dir}") - + results = [] - + for idx, file_path in enumerate(txt_files, 1): print(f"\n[{idx}/{len(txt_files)}] Testing {file_path.name}...") result = test_saved_code(file_path) results.append(result) - + # Print summary print(f"\n{'='*80}") print("FINAL SUMMARY") print(f"{'='*80}") - + for result in results: - file_name = result['file'] - status = "✓ PASS" if result.get('all_passed', False) else "✗ FAIL" + file_name = result["file"] + status = "✓ PASS" if result.get("all_passed", False) else "✗ FAIL" print(f"{file_name:40} {status}") - if 'error' in result: - error_msg = result['error'] + if "error" in result: + error_msg = result["error"] # Truncate long errors if len(error_msg) > 100: error_msg = error_msg[:100] + "..." 
print(f" Error: {error_msg}") - - total_passed = sum(1 for r in results if r.get('all_passed', False)) + + total_passed = sum(1 for r in results if r.get("all_passed", False)) print(f"\nTotal: {total_passed}/{len(results)} files passed all tests") if __name__ == "__main__": main() - diff --git a/flashinfer_bench/ffi/test_prompt.py b/flashinfer_bench/ffi/test_prompt.py index fa87ba9..bea64ca 100644 --- a/flashinfer_bench/ffi/test_prompt.py +++ b/flashinfer_bench/ffi/test_prompt.py @@ -3,15 +3,15 @@ Tests elementwise add kernel generation across multiple models. """ +import os +import re +from pathlib import Path + import torch -from tvm_ffi import Module import tvm_ffi.cpp -from pathlib import Path -import re -import os from dotenv import load_dotenv - from ffi_prompt import FFI_PROMPT_SIMPLE +from tvm_ffi import Module load_dotenv() @@ -34,19 +34,11 @@ def get_model_config(model_name: str): openai_key = os.getenv("OPENAI_API_KEY") # Anthropic models anthropic_key = os.getenv("ANTHROPIC_API_KEY") - + if model_name in ["gpt-5-2025-08-07", "o3", "gpt-5-mini-2025-08-07", "o4-mini-2025-04-16"]: - return { - "provider": "openai", - "api_key": openai_key, - "model": model_name - } + return {"provider": "openai", "api_key": openai_key, "model": model_name} elif model_name in ["claude-opus-4-1-20250805", "claude-sonnet-4-5-20250805"]: - return { - "provider": "anthropic", - "api_key": anthropic_key, - "model": model_name - } + return {"provider": "anthropic", "api_key": anthropic_key, "model": model_name} else: raise ValueError(f"Unknown model: {model_name}") @@ -54,59 +46,52 @@ def get_model_config(model_name: str): def call_openai_model(model_name: str, api_key: str, prompt: str) -> str: """Call OpenAI API to generate code.""" import openai - + client = openai.OpenAI(api_key=api_key) - + if model_name in ["o3", "o4-mini-2025-04-16"]: response = client.chat.completions.create( model=model_name, - messages=[ - {"role": "user", "content": prompt} - ], - reasoning_effort="high" + messages=[{"role": "user", "content": prompt}], + reasoning_effort="high", ) else: response = client.chat.completions.create( model=model_name, messages=[ {"role": "system", "content": ELEMENTWISE_ADD_PROMPT}, - {"role": "user", "content": FFI_PROMPT_SIMPLE} - ] + {"role": "user", "content": FFI_PROMPT_SIMPLE}, + ], ) - + return response.choices[0].message.content def call_anthropic_model(model_name: str, api_key: str, prompt: str) -> str: import anthropic - + client = anthropic.Anthropic(api_key=api_key) - + response = client.messages.create( model=model_name, max_tokens=4096, system=ELEMENTWISE_ADD_PROMPT, - messages=[ - {"role": "user", "content": FFI_PROMPT_SIMPLE} - ] + messages=[{"role": "user", "content": FFI_PROMPT_SIMPLE}], ) - + return response.content[0].text def extract_cuda_code(response: str) -> str: - patterns = [ - r"```(?:cpp|cuda|c\+\+)\n(.*?)```", - r"```\n(.*?)```", - ] - + patterns = [r"```(?:cpp|cuda|c\+\+)\n(.*?)```", r"```\n(.*?)```"] + for pattern in patterns: matches = re.findall(pattern, response, re.DOTALL) if matches: for match in matches: if "TVM_FFI" in match or "TensorView" in match: return match.strip() - + return response.strip() @@ -114,34 +99,34 @@ def test_kernel(mod: Module, test_name: str) -> bool: """Test the generated kernel with simple test cases.""" try: print(f"\n Running {test_name}...") - + if test_name == "test_small": # Test 1: Small tensor - a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device='cuda') - b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], 
dtype=torch.float32, device='cuda') + a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device="cuda") + b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device="cuda") c = torch.empty_like(a) - + mod.elementwise_add(a, b, c) expected = a + b - + torch.testing.assert_close(c, expected) print(f" ✓ {test_name} passed") return True - + elif test_name == "test_large": # Test 2: Larger tensor n = 10000 - a = torch.randn(n, dtype=torch.float32, device='cuda') - b = torch.randn(n, dtype=torch.float32, device='cuda') + a = torch.randn(n, dtype=torch.float32, device="cuda") + b = torch.randn(n, dtype=torch.float32, device="cuda") c = torch.empty_like(a) - + mod.elementwise_add(a, b, c) expected = a + b - + torch.testing.assert_close(c, expected) print(f" ✓ {test_name} passed") return True - + except Exception as e: print(f" ✗ {test_name} failed: {e}") return False @@ -151,12 +136,12 @@ def test_model(model_name: str, output_dir: Path): print(f"\n{'='*80}") print(f"Testing model: {model_name}") print(f"{'='*80}") - + try: config = get_model_config(model_name) - + full_prompt = ELEMENTWISE_ADD_PROMPT + "\n\n" + FFI_PROMPT_SIMPLE - + print(f"Calling {config['provider']} API...") if config["provider"] == "openai": response = call_openai_model(config["model"], config["api_key"], full_prompt) @@ -164,120 +149,113 @@ def test_model(model_name: str, output_dir: Path): response = call_anthropic_model(config["model"], config["api_key"], full_prompt) else: raise ValueError(f"Unknown provider: {config['provider']}") - + print(f"Received response from {model_name}") - + cuda_code = extract_cuda_code(response) print(f"Extracted {len(cuda_code)} characters of code") - + output_file = output_dir / f"{model_name}.txt" - with open(output_file, 'w') as f: + with open(output_file, "w") as f: f.write(f"Model: {model_name}\n") f.write(f"{'='*80}\n\n") f.write("Generated Code:\n") - f.write("="*80 + "\n") + f.write("=" * 80 + "\n") f.write(cuda_code) f.write("\n\n") - + print(f"Saved response to {output_file}") - + try: mod: Module = tvm_ffi.cpp.load_inline( - name=f'elementwise_add_{model_name.replace("-", "_")}', - cuda_sources=cuda_code + name=f'elementwise_add_{model_name.replace("-", "_")}', cuda_sources=cuda_code ) print("Compilation successful!") - + # Run tests print("\nRunning tests...") test1_passed = test_kernel(mod, "test_small") test2_passed = test_kernel(mod, "test_large") - - with open(output_file, 'a') as f: + + with open(output_file, "a") as f: f.write("Test Results:\n") - f.write("="*80 + "\n") + f.write("=" * 80 + "\n") f.write(f"Compilation: SUCCESS\n") f.write(f"Test 1 (small tensor): {'PASS' if test1_passed else 'FAIL'}\n") f.write(f"Test 2 (large tensor): {'PASS' if test2_passed else 'FAIL'}\n") - + all_passed = test1_passed and test2_passed status = "ALL TESTS PASSED" if all_passed else "TESTS FAILED" print(f"\n{status}") - + return { "model": model_name, "compilation": "success", "test_small": test1_passed, "test_large": test2_passed, - "all_passed": all_passed + "all_passed": all_passed, } - + except Exception as e: print(f"Compilation failed: {e}") - with open(output_file, 'a') as f: + with open(output_file, "a") as f: f.write("Test Results:\n") - f.write("="*80 + "\n") + f.write("=" * 80 + "\n") f.write(f"Compilation: FAILED\n") f.write(f"Error: {str(e)}\n") - + return { "model": model_name, "compilation": "failed", "error": str(e), - "all_passed": False + "all_passed": False, } - + except Exception as e: print(f"Error testing {model_name}: {e}") - return { 
- "model": model_name, - "compilation": "error", - "error": str(e), - "all_passed": False - } + return {"model": model_name, "compilation": "error", "error": str(e), "all_passed": False} -def main(): +def main(): models = [ "gpt-5-2025-08-07", "o3", "claude-opus-4-1-20250805", "claude-sonnet-4-5-20250805", "gpt-5-mini-2025-08-07", - "o4-mini-2025-04-16" + "o4-mini-2025-04-16", ] - + output_dir = Path(__file__).parent / "test_results" output_dir.mkdir(exist_ok=True) - - print("="*80) + + print("=" * 80) print("Testing LLM FFI Code Generation") - print("="*80) + print("=" * 80) print(f"Models to test: {len(models)}") print(f"Output directory: {output_dir}") - + results = [] - + for idx, model_name in enumerate(models, 1): print(f"\n[{idx}/{len(models)}] Testing {model_name}...") result = test_model(model_name, output_dir) results.append(result) - + print(f"\n{'='*80}") print("FINAL SUMMARY") print(f"{'='*80}") - + for result in results: - model = result['model'] - status = "✓ PASS" if result.get('all_passed', False) else "✗ FAIL" + model = result["model"] + status = "✓ PASS" if result.get("all_passed", False) else "✗ FAIL" print(f"{model:30} {status}") - if 'error' in result: + if "error" in result: print(f" Error: {result['error'][:100]}") - - total_passed = sum(1 for r in results if r.get('all_passed', False)) + + total_passed = sum(1 for r in results if r.get("all_passed", False)) print(f"\nTotal: {total_passed}/{len(models)} models passed all tests") if __name__ == "__main__": main() - diff --git a/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt b/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt index 83dd087..dcae1da 100644 --- a/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt +++ b/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt @@ -23,18 +23,18 @@ void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::Te DLDevice dev_a = a.device(); DLDevice dev_b = b.device(); DLDevice dev_c = c.device(); - + if (dev_a.device_type != dev_b.device_type || dev_a.device_id != dev_b.device_id || dev_a.device_type != dev_c.device_type || dev_a.device_id != dev_c.device_id) { TVM_FFI_THROW(ValueError) << "All tensors must be on the same device"; } - + // Check that we're on CUDA device if (dev_a.device_type != kDLCUDA) { - TVM_FFI_THROW(RuntimeError) << "This kernel only supports CUDA devices, got device type: " + TVM_FFI_THROW(RuntimeError) << "This kernel only supports CUDA devices, got device type: " << dev_a.device_type; } - + // Validate tensor dimensions - must be 1D if (a.ndim() != 1) { TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be 1D, got " << a.ndim() << "D"; @@ -45,7 +45,7 @@ void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::Te if (c.ndim() != 1) { TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be 1D, got " << c.ndim() << "D"; } - + // Validate tensor shapes match int64_t n = a.size(0); if (b.size(0) != n) { @@ -56,12 +56,12 @@ void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::Te TVM_FFI_THROW(ValueError) << "Output tensor size must match input size. 
" << "Expected " << n << ", got " << c.size(0); } - + // Validate data types - must be float32 DLDataType dtype_a = a.dtype(); DLDataType dtype_b = b.dtype(); DLDataType dtype_c = c.dtype(); - + if (dtype_a.code != kDLFloat || dtype_a.bits != 32 || dtype_a.lanes != 1) { TVM_FFI_THROW(TypeError) << "Input tensor 'a' must be float32"; } @@ -71,7 +71,7 @@ void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::Te if (dtype_c.code != kDLFloat || dtype_c.bits != 32 || dtype_c.lanes != 1) { TVM_FFI_THROW(TypeError) << "Output tensor 'c' must be float32"; } - + // Validate tensors are contiguous if (!a.IsContiguous()) { TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be contiguous"; @@ -82,29 +82,29 @@ void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::Te if (!c.IsContiguous()) { TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be contiguous"; } - + // Get data pointers const float* a_data = static_cast(a.data_ptr()); const float* b_data = static_cast(b.data_ptr()); float* c_data = static_cast(c.data_ptr()); - + // Handle empty tensor case if (n == 0) { return; } - + // Get CUDA stream from environment cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev_a.device_type, dev_a.device_id)); - + // Configure kernel launch parameters const int64_t threads_per_block = 256; int64_t num_blocks = (n + threads_per_block - 1) / threads_per_block; - + // Launch kernel ElementwiseAddKernel<<>>( a_data, b_data, c_data, n); - + // Check for kernel launch errors cudaError_t err = cudaGetLastError(); if (err != cudaSuccess) { From 365536ee226f86ab4d13a658ae993ecaa5c8a43a Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Mon, 17 Nov 2025 09:11:05 -0500 Subject: [PATCH 03/10] full ffi prompt --- flashinfer_bench/agents/ffi_prompt.py | 1869 +++++++++++++++++ .../{ffi => agents}/load_inline.py | 0 .../test_existing_prompt_sol.py | 0 .../{ffi => agents}/test_prompt.py | 0 .../test_results/claude-opus-4-1-20250805.txt | 0 .../test_results/gpt-5-2025-08-07.txt | 0 .../test_results/gpt-5-mini-2025-08-07.txt | 0 .../{ffi => agents}/test_results/o3.txt | 0 .../test_results/o4-mini-2025-04-16.txt | 0 flashinfer_bench/agents/torch_load_inline.py | 87 + flashinfer_bench/ffi/ffi_prompt.py | 647 ------ 11 files changed, 1956 insertions(+), 647 deletions(-) create mode 100644 flashinfer_bench/agents/ffi_prompt.py rename flashinfer_bench/{ffi => agents}/load_inline.py (100%) rename flashinfer_bench/{ffi => agents}/test_existing_prompt_sol.py (100%) rename flashinfer_bench/{ffi => agents}/test_prompt.py (100%) rename flashinfer_bench/{ffi => agents}/test_results/claude-opus-4-1-20250805.txt (100%) rename flashinfer_bench/{ffi => agents}/test_results/gpt-5-2025-08-07.txt (100%) rename flashinfer_bench/{ffi => agents}/test_results/gpt-5-mini-2025-08-07.txt (100%) rename flashinfer_bench/{ffi => agents}/test_results/o3.txt (100%) rename flashinfer_bench/{ffi => agents}/test_results/o4-mini-2025-04-16.txt (100%) create mode 100644 flashinfer_bench/agents/torch_load_inline.py delete mode 100644 flashinfer_bench/ffi/ffi_prompt.py diff --git a/flashinfer_bench/agents/ffi_prompt.py b/flashinfer_bench/agents/ffi_prompt.py new file mode 100644 index 0000000..ff09517 --- /dev/null +++ b/flashinfer_bench/agents/ffi_prompt.py @@ -0,0 +1,1869 @@ +FFI_PROMPT_SIMPLE = """ +Use TVM FFI format for your generated kernel host function and bindings + +Use the following headers: +#include // TensorView: tensor arguments +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // TVM_FFI_ICHECK, 
TVM_FFI_THROW + +Include when using CUDA / env-managed streams or allocators: +#include // TVMFFIEnvGetStream, TVMFFIEnvTensorAlloc + +# TVM FFI API Reference + +## TensorView (tvm/ffi/container/tensor.h) + +**Purpose:** Non-owning view of tensor data. Use `tvm::ffi::TensorView` for function parameters. + +### Essential Methods +```cpp +void* data_ptr() const // Raw data pointer +DLDevice device() const // Device info (.device_type, .device_id) +DLDataType dtype() const // Data type (.code, .bits, .lanes) +int32_t ndim() const // Number of dimensions +int64_t size(int64_t idx) const // Size of dimension idx (negative idx: count from end) +int64_t numel() const // Total number of elements +bool IsContiguous() const // Check if memory is contiguous +``` + +### ShapeView Access +```cpp +ShapeView shape() const // Get shape (array-like) +ShapeView strides() const // Get strides (array-like) + +// ShapeView can be indexed like an array: +// tensor.shape()[0], tensor.shape()[1], etc. +``` + +### Device Type Constants (DLDevice.device_type) +```cpp +kDLCPU = 1 // CPU +kDLCUDA = 2 // CUDA GPU +kDLCUDAHost = 3 // CUDA pinned memory +kDLROCM = 10 // ROCm/HIP +``` + +### Data Type Constants (DLDataType) +```cpp +// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes +// Common .code values: +kDLInt = 0 // Signed integer +kDLUInt = 1 // Unsigned integer +kDLFloat = 2 // IEEE floating point +kDLBfloat = 4 // Brain floating point + +// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1 +// Example: half/fp16 has code=2, bits=16, lanes=1 +``` + +## Function Export (tvm/ffi/function.h) + +### Export Macro +```cpp +// Use this to export your C++ function for FFI +TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) + +// Example: +void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... 
} +TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); +``` + +**Supported function signatures:** +- `void func(TensorView t1, TensorView t2, ...)` +- `void func(TensorView t, int64_t size, float alpha, ...)` +- `int64_t func(TensorView t)` +- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string` + +## Error Handling (tvm/ffi/error.h) + +### Throwing Errors +```cpp +// Throw with custom error kind +TVM_FFI_THROW(ValueError) << "Invalid input: " << x; +TVM_FFI_THROW(RuntimeError) << "CUDA error: " << cudaGetErrorString(err); +TVM_FFI_THROW(TypeError) << "Expected float32, got int32"; +``` + +### Assertions (for internal logic errors) +```cpp +TVM_FFI_ICHECK(condition) << "message" // General check +TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y +TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y +TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y +TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y +TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y +TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y +``` + +### User Input Validation (use TVM_FFI_THROW instead) +```cpp +// For user-facing errors, use TVM_FFI_THROW with appropriate error kind: +if (x.ndim() != 2) { + TVM_FFI_THROW(ValueError) << "Expected 2D tensor, got " << x.ndim() << "D"; +} +if (x.dtype().code != kDLFloat || x.dtype().bits != 32) { + TVM_FFI_THROW(TypeError) << "Expected float32 dtype"; +} +``` + +## CUDA Stream Management (tvm/ffi/extra/c_env_api.h) + +```cpp +// Get the current CUDA stream for a device +TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) + +// Usage: +DLDevice dev = tensor.device(); +cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + +// Launch kernel on the stream: +my_kernel<<>>(...); +``` + +Example Usage of FFI binding: +```cpp +// File: add_one_cuda.cu +#include +#include +#include +#include + +namespace my_kernels { + +__global__ void AddOneKernel(float* x, float* y, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + y[idx] = x[idx] + 1.0f; + } +} + +void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { + // Input validation + TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; + TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; + TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch"; + + // Get data pointers + float* x_data = static_cast(x.data_ptr()); + float* y_data = static_cast(y.data_ptr()); + int64_t n = x.size(0); + + // Get CUDA stream from environment + DLDevice dev = x.device(); + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + + // Launch kernel + int64_t threads = 256; + int64_t blocks = (n + threads - 1) / threads; + AddOneKernel<<>>(x_data, y_data, n); +} + +// Export the function with name "add_one_cuda" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne); + +} // namespace my_kernels +``` +""" + +FFI_FULL_PROMPT = """ +# TVM FFI Complete API Reference + +This guide provides comprehensive documentation for writing CUDA kernels with TVM FFI bindings. 
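+Before the detailed reference, here is a minimal sketch of a complete binding (the kernel, wrapper, and export names are placeholders; fuller, fully validated versions appear in the patterns section at the end).
+
+```cpp
+// Minimal sketch: a CUDA kernel plus its host wrapper and FFI export.
+__global__ void FillKernel(float* out, float value, int64_t n) {
+  int64_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n) out[i] = value;
+}
+
+void Fill(tvm::ffi::TensorView out, double value) {
+  float* data = static_cast<float*>(out.data_ptr());
+  int64_t n = out.numel();
+  DLDevice dev = out.device();
+  cudaStream_t stream = static_cast<cudaStream_t>(
+      TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+  int64_t threads = 256;
+  FillKernel<<<(n + threads - 1) / threads, threads, 0, stream>>>(
+      data, static_cast<float>(value), n);
+}
+
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(fill, Fill);
+```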
+ +## Required Headers + +```cpp +#include // Tensor, TensorView +#include // Array +#include // Map +#include // Tuple<...> +#include // Shape, ShapeView +#include // Variant<...> +#include // String +#include // DLDataType utilities +#include // Function export macros +#include // Error handling +#include // CUDA stream management +#include // Module loading +``` + +--- + +# Part 1: Data Container Types + +## 1.1 Tensor & TensorView (tvm/ffi/container/tensor.h) + +### TensorView - Lightweight Non-Owning View + +**Purpose:** Non-owning view of tensor data. Always use `tvm::ffi::TensorView` for function parameters. + +**Constructors:** +```cpp +TensorView(const Tensor& tensor) // From Tensor +TensorView(const DLTensor* tensor) // From DLTensor pointer +``` + +**Essential Methods:** +```cpp +// Data access +void* data_ptr() const // Raw data pointer +int64_t byte_offset() const // Byte offset from base pointer + +// Device information +DLDevice device() const // Returns device (.device_type, .device_id) + +// Type information +DLDataType dtype() const // Returns data type (.code, .bits, .lanes) + +// Dimension information +int32_t ndim() const // Number of dimensions +int64_t size(int64_t idx) const // Size of dimension idx (supports negative indexing) +int64_t stride(int64_t idx) const // Stride of dimension idx (supports negative indexing) +int64_t numel() const // Total number of elements + +// Shape and strides access +ShapeView shape() const // Get shape (array-like, indexable: shape()[0], shape()[1], ...) +ShapeView strides() const // Get strides (array-like) + +// Memory properties +bool IsContiguous() const // Check if memory layout is contiguous +bool IsAligned(size_t alignment) const // Check if data pointer is aligned +``` + +**Usage Example:** +```cpp +void MyKernel(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { + // Get dimensions + int64_t batch = input.size(0); + int64_t channels = input.size(1); + int64_t height = input.size(2); + int64_t width = input.size(3); + + // Get data pointer + float* in_data = static_cast(input.data_ptr()); + float* out_data = static_cast(output.data_ptr()); + + // Check device + DLDevice dev = input.device(); + if (dev.device_type == kDLCUDA) { + // CUDA-specific code + } +} +``` + +### Tensor - Managed Owning Container + +**Purpose:** Reference-counted tensor with memory ownership. Use when you need to allocate or store tensors. 
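+For example (a sketch; `MakeZerosLike` is a placeholder name and error handling is omitted), a function that hands a freshly allocated buffer back to the caller returns an owning `Tensor` rather than a `TensorView`, so the allocation stays alive after the function exits:
+
+```cpp
+// Allocate a new tensor matching `like` in shape/dtype/device and return it (owning).
+tvm::ffi::Tensor MakeZerosLike(tvm::ffi::TensorView like) {
+  tvm::ffi::Tensor out = tvm::ffi::Tensor::FromEnvAlloc(
+      TVMFFIEnvTensorAlloc, like.shape(), like.dtype(), like.device());
+  // ... fill `out` with zeros on like.device() before returning ...
+  return out;
+}
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(make_zeros_like, MakeZerosLike);
+```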
+ +**All TensorView methods** (Tensor has same interface as TensorView) + +**Static Factory Methods:** +```cpp +// From DLPack +static Tensor FromDLPack(DLManagedTensor* tensor, + size_t require_alignment = 0, + bool require_contiguous = false) + +static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, + size_t require_alignment = 0, + bool require_contiguous = false) + +// Allocate from environment (recommended for kernel outputs) +static Tensor FromEnvAlloc(int (*env_alloc)(DLTensor*, TVMFFIObjectHandle*), + ffi::ShapeView shape, + DLDataType dtype, + DLDevice device) + +// Example: Allocate tensor using environment allocator +ffi::Tensor output = ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, shape, dtype, device); +``` + +**Conversion Methods:** +```cpp +DLManagedTensor* ToDLPack() const // Export to DLPack +DLManagedTensorVersioned* ToDLPackVersioned() const +const DLTensor* GetDLTensorPtr() const // Get underlying DLTensor +``` + +### Utility Functions +```cpp +// Check if tensor is contiguous +bool IsContiguous(const DLTensor& arr) + +// Check alignment +bool IsAligned(const DLTensor& arr, size_t alignment) + +// Check if device supports direct addressing (CPU, CUDA, etc.) +bool IsDirectAddressDevice(const DLDevice& device) + +// Calculate data size in bytes +size_t GetDataSize(size_t numel, DLDataType dtype) +size_t GetDataSize(const DLTensor& arr) +size_t GetDataSize(const Tensor& tensor) +size_t GetDataSize(const TensorView& tensor) +``` + +### Device Type Constants +```cpp +kDLCPU = 1 // CPU +kDLCUDA = 2 // CUDA GPU +kDLCUDAHost = 3 // CUDA pinned memory +kDLROCM = 10 // ROCm/HIP +kDLMetal = 8 // Metal (Apple) +kDLVulkan = 7 // Vulkan +``` + +### Data Type Constants (DLDataType) +```cpp +// DLDataType structure: { uint8_t code, uint8_t bits, uint16_t lanes } + +// Type codes (code field) +kDLInt = 0 // Signed integer +kDLUInt = 1 // Unsigned integer +kDLFloat = 2 // IEEE floating point +kDLBfloat = 4 // Brain floating point +kDLBool = 5 // Boolean +kDLFloat8_e4m3fn = 6 // FP8 E4M3 +kDLFloat8_e5m2 = 7 // FP8 E5M2 +// ... and more FP8 variants + +// Common data types (examples): +// float32: code=kDLFloat, bits=32, lanes=1 +// float16: code=kDLFloat, bits=16, lanes=1 +// int32: code=kDLInt, bits=32, lanes=1 +// uint8: code=kDLUInt, bits=8, lanes=1 +// bfloat16: code=kDLBfloat, bits=16, lanes=1 +``` + +--- + +## 1.2 Array (tvm/ffi/container/array.h) + +**Purpose:** Dynamic array container with copy-on-write semantics. Similar to `std::vector` but FFI-compatible. + +**Type Parameter:** `T` must be compatible with `tvm::ffi::Any` (ObjectRef types, primitives via TypeTraits) + +**Constructors:** +```cpp +Array() // Empty array +Array(size_t n, const T& val) // n copies of val +Array(std::initializer_list init) // From initializer list +Array(const std::vector& vec) // From std::vector +Array(IterType first, IterType last) // From iterator range +``` + +**Element Access:** +```cpp +T operator[](int64_t i) const // Read i-th element +T front() const // First element +T back() const // Last element +T at(int64_t i) const // Bounds-checked access +``` + +**Capacity:** +```cpp +size_t size() const // Number of elements +size_t capacity() const // Allocated capacity +bool empty() const // Check if empty +void reserve(int64_t n) // Reserve capacity +void resize(int64_t n) // Resize array +``` + +**Modifiers (Copy-on-Write):** +```cpp +void push_back(const T& item) // Add element to end +void emplace_back(Args&&... 
args) // Construct element in-place +void pop_back() // Remove last element +void insert(iterator pos, const T& val) // Insert at position +void erase(iterator pos) // Remove at position +void erase(iterator first, iterator last) // Remove range +void clear() // Remove all elements +void Set(int64_t i, T value) // Set i-th element +``` + +**Iterators:** +```cpp +iterator begin() const // Begin iterator +iterator end() const // End iterator +reverse_iterator rbegin() const // Reverse begin +reverse_iterator rend() const // Reverse end +``` + +**Functional Operations:** +```cpp +// Map function over array +template +Array Map(F fmap) const // Returns Array + +// Mutate array in-place (if unique owner) +template +void MutateByApply(F fmutate) // F: T -> T +``` + +**Static Methods:** +```cpp +// Aggregate multiple arrays/elements +template +static Array Agregate(Args... args) // Combine T and Array args +``` + +**Usage Example:** +```cpp +// Create array +Array dims = {1, 3, 224, 224}; + +// Access elements +int64_t batch = dims[0]; + +// Modify (copy-on-write) +dims.push_back(512); +dims.Set(0, 2); + +// Map operation +Array doubled = dims.Map([](int64_t x) { return x * 2; }); + +// Iterate +for (int64_t d : dims) { + // process d +} +``` + +--- + +## 1.3 Map (tvm/ffi/container/map.h) + +**Purpose:** Hash map container with copy-on-write semantics. Similar to `std::unordered_map`. + +**Type Parameters:** +- `K` (key type) must be hashable and compatible with Any +- `V` (value type) must be compatible with Any + +**Constructors:** +```cpp +Map() // Empty map +Map(std::initializer_list> init) +Map(IterType first, IterType last) // From iterator range +Map(const std::unordered_map& map) // From std::unordered_map +``` + +**Element Access:** +```cpp +V at(const K& key) const // Throws if key not found +V operator[](const K& key) const // Throws if key not found +Optional Get(const K& key) const // Returns Optional (nullopt if not found) +V Get(const K& key, const V& default_value) const // Returns default if not found +``` + +**Capacity:** +```cpp +size_t size() const // Number of elements +size_t count(const K& key) const // 0 or 1 (check if key exists) +bool empty() const // Check if empty +``` + +**Modifiers (Copy-on-Write):** +```cpp +void Set(const K& key, const V& value) // Insert or update +void erase(const K& key) // Remove key +void clear() // Remove all elements +``` + +**Iterators:** +```cpp +iterator begin() const // Begin iterator +iterator end() const // End iterator + +// Iterator dereference returns std::pair +for (auto kv : map) { + K key = kv.first; + V value = kv.second; +} +``` + +**Static Methods:** +```cpp +// Construct map from keys and values +static Map FromItems(const Array>& items) +``` + +**Usage Example:** +```cpp +// Create map +Map config = { + {"batch_size", 32}, + {"num_layers", 12} +}; + +// Access +int64_t batch = config["batch_size"]; +Optional opt = config.Get("hidden_dim"); +if (opt.defined()) { + int64_t hidden = opt.value(); +} + +// Modify +config.Set("hidden_dim", 768); +config.erase("num_layers"); + +// Iterate +for (auto kv : config) { + String key = kv.first; + int64_t value = kv.second; +} +``` + +--- + +## 1.4 Tuple (tvm/ffi/container/tuple.h) + +**Purpose:** Fixed-size heterogeneous container. Similar to `std::tuple` but FFI-compatible. + +**Constructors:** +```cpp +Tuple() // Default construct +Tuple(T1 v1, T2 v2, ...) 
// From values +``` + +**Element Access:** +```cpp +// Compile-time index access +std::get<0>(tuple) // Get first element +std::get<1>(tuple) // Get second element + +// Runtime index access (returns Any) +Any operator[](int64_t idx) const // Get element at runtime index +``` + +**Capacity:** +```cpp +static constexpr size_t size() // Number of elements (compile-time) +``` + +**Structured Binding:** +```cpp +auto [a, b, c] = tuple; // C++17 structured binding +``` + +**Usage Example:** +```cpp +// Create tuple +Tuple result = {42, 3.14f, "success"}; + +// Access elements +int64_t code = std::get<0>(result); +float value = std::get<1>(result); +String msg = std::get<2>(result); + +// Or with structured binding +auto [code, value, msg] = result; +``` + +--- + +## 1.5 Shape & ShapeView (tvm/ffi/container/shape.h) + +### ShapeView - Lightweight Non-Owning View + +**Purpose:** Non-owning view of shape dimensions. Use for passing shapes without allocation. + +**Constructors:** +```cpp +ShapeView() // Empty +ShapeView(const int64_t* data, size_t size) // From pointer and size +ShapeView(std::initializer_list init) // From initializer list +``` + +**Element Access:** +```cpp +int64_t operator[](size_t idx) const // Access dimension +int64_t at(size_t idx) const // Bounds-checked access +int64_t front() const // First dimension +int64_t back() const // Last dimension +``` + +**Properties:** +```cpp +const int64_t* data() const // Data pointer +size_t size() const // Number of dimensions +bool empty() const // Check if empty +int64_t Product() const // Product of all dimensions +``` + +**Iterators:** +```cpp +const int64_t* begin() const +const int64_t* end() const +``` + +### Shape - Managed Owning Container + +**Purpose:** Reference-counted shape with memory ownership. + +**Constructors:** +```cpp +Shape() // Empty +Shape(std::initializer_list init) +Shape(std::vector vec) +Shape(Array arr) +Shape(ShapeView view) +Shape(IterType first, IterType last) +``` + +**All ShapeView methods** (same interface) + +**Static Factory:** +```cpp +// Create strides from shape (row-major) +static Shape StridesFromShape(ShapeView shape) +``` + +**Conversion:** +```cpp +operator ShapeView() const // Implicit conversion to ShapeView +``` + +**Usage Example:** +```cpp +// Create shape +Shape shape = {2, 3, 224, 224}; // NCHW + +// Access +int64_t batch = shape[0]; +int64_t channels = shape[1]; + +// Calculate total elements +int64_t numel = shape.Product(); // 2 * 3 * 224 * 224 + +// Create strides +Shape strides = Shape::StridesFromShape(shape); +// strides = {3*224*224, 224*224, 224, 1} +``` + +--- + +## 1.6 Variant (tvm/ffi/container/variant.h) + +**Purpose:** Type-safe union that can hold one of several types. Similar to `std::variant`. 
+ +**Constructors:** +```cpp +Variant(T1 val) // Construct with T1 +Variant(T2 val) // Construct with T2 +``` + +**Type Checking and Casting:** +```cpp +// Try cast (returns std::optional) +template +std::optional as() const // Returns nullopt if wrong type + +// Cast (throws on failure) +template +T get() const& // Copy value +T get() && // Move value + +// Get type information +std::string GetTypeKey() const // Get type key string +``` + +**Usage Example:** +```cpp +using Value = Variant; + +Value v1 = 42; +Value v2 = 3.14f; +Value v3 = String("hello"); + +// Check and extract +if (auto opt_int = v1.as()) { + int64_t val = *opt_int; +} + +// Or direct cast (throws if wrong type) +int64_t val = v1.get(); +``` + +--- + +## 1.7 String (tvm/ffi/string.h) + +**Purpose:** FFI-compatible string with small-string optimization. + +**Constructors:** +```cpp +String() // Empty string +String(const char* str) +String(const std::string& str) +String(std::string&& str) // Move from std::string +String(const char* data, size_t size) +``` + +**Properties:** +```cpp +const char* data() const // Data pointer +const char* c_str() const // Null-terminated C string +size_t size() const // Length +size_t length() const // Length (same as size) +bool empty() const // Check if empty +``` + +**Element Access:** +```cpp +char at(size_t pos) const // Bounds-checked access +``` + +**Comparison:** +```cpp +int compare(const String& other) const +int compare(const std::string& other) const +int compare(const char* other) const + +// Operators: ==, !=, <, >, <=, >= +``` + +**Concatenation:** +```cpp +String operator+(const String& lhs, const String& rhs) +String operator+(const String& lhs, const char* rhs) +// ... and other combinations +``` + +**Conversion:** +```cpp +operator std::string() const // Convert to std::string +``` + +**Utility:** +```cpp +String EscapeString(const String& value) // Escape for JSON/C++ +``` + +--- + +# Part 2: Data Type Utilities (tvm/ffi/dtype.h) + +## DLDataType Structure + +```cpp +struct DLDataType { + uint8_t code; // Type code (kDLInt, kDLFloat, etc.) + uint8_t bits; // Number of bits + uint16_t lanes; // Number of lanes (for vector types) +}; +``` + +## Type Code Constants + +```cpp +kDLInt = 0 // Signed integer +kDLUInt = 1 // Unsigned integer +kDLFloat = 2 // IEEE floating point +kDLOpaqueHandle = 3 // Opaque handle +kDLBfloat = 4 // Brain floating point +kDLBool = 5 // Boolean +kDLFloat8_e4m3fn = 6 +kDLFloat8_e5m2 = 7 +// ... 
many more FP8 variants +``` + +## Conversion Functions + +```cpp +// String to DLDataType +DLDataType StringToDLDataType(const String& str) +// Examples: "float32", "int64", "float16", "uint8" + +// DLDataType to String +String DLDataTypeToString(DLDataType dtype) + +// Get type code as string (internal) +const char* DLDataTypeCodeAsCStr(DLDataTypeCode type_code) +``` + +## Operators + +```cpp +// Comparison +bool operator==(const DLDataType& lhs, const DLDataType& rhs) +bool operator!=(const DLDataType& lhs, const DLDataType& rhs) + +// Output stream +std::ostream& operator<<(std::ostream& os, DLDataType dtype) +``` + +**Usage Example:** +```cpp +// Check dtype +DLDataType dt = tensor.dtype(); +if (dt.code == kDLFloat && dt.bits == 32) { + // float32 tensor +} + +// Convert from string +DLDataType float16_dtype = StringToDLDataType("float16"); + +// Convert to string +String dtype_str = DLDataTypeToString(dt); // "float32" +``` + +--- + +# Part 3: Error Handling (tvm/ffi/error.h) + +## Error Class + +```cpp +class Error : public std::exception { +public: + Error(std::string kind, std::string message, std::string backtrace); + + std::string kind() const; // Error category + std::string message() const; // Error message + std::string backtrace() const; // Stack trace + std::string TracebackMostRecentCallLast() const; // Python-style traceback + const char* what() const noexcept override; // Standard exception interface +}; +``` + +## Throwing Errors + +```cpp +// Throw with automatic backtrace +TVM_FFI_THROW(ErrorKind) << "message" << variable << "more text" + +// Common error kinds: +TVM_FFI_THROW(ValueError) << "Invalid value: " << x; +TVM_FFI_THROW(TypeError) << "Expected float32, got " << dtype; +TVM_FFI_THROW(RuntimeError) << "CUDA error: " << error_string; +TVM_FFI_THROW(InternalError) << "Unexpected condition"; +TVM_FFI_THROW(IndexError) << "Index " << i << " out of bounds"; + +// Log to stderr and throw (for startup functions) +TVM_FFI_LOG_AND_THROW(ErrorKind) << "message"; +``` + +## Assertions (Internal Logic Checks) + +```cpp +// General check +TVM_FFI_ICHECK(condition) << "message" + +// Comparison checks (more informative error messages) +TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y +TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y +TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y +TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y +TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y +TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y +TVM_FFI_ICHECK_NOTNULL(ptr) << "ptr must not be null" // ptr != nullptr + +// Custom error type check +TVM_FFI_CHECK(condition, ErrorKind) << "message" +``` + +## Environment Error Handling + +```cpp +// Check for interrupts (e.g., Python Ctrl+C) +int TVMFFIEnvCheckSignals() + +// Exception for pre-existing errors in environment +throw tvm::ffi::EnvErrorAlreadySet(); + +// Usage in long-running functions +void LongRunningFunction() { + if (TVMFFIEnvCheckSignals() != 0) { + throw ::tvm::ffi::EnvErrorAlreadySet(); + } + // ... 
do work +} +``` + +## Safe Call Wrappers (for C API) + +```cpp +// Wrap C++ code for C API +TVM_FFI_SAFE_CALL_BEGIN(); +// C++ code that may throw +TVM_FFI_SAFE_CALL_END(); + +// Check C function return codes +TVM_FFI_CHECK_SAFE_CALL(function_call); +``` + +**Usage Examples:** +```cpp +// User input validation +void MyKernel(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { + if (x.ndim() != 2) { + TVM_FFI_THROW(ValueError) << "Expected 2D tensor, got " << x.ndim() << "D"; + } + + DLDataType dt = x.dtype(); + if (dt.code != kDLFloat || dt.bits != 32) { + TVM_FFI_THROW(TypeError) << "Expected float32, got " << DLDataTypeToString(dt); + } + + // Internal consistency checks + TVM_FFI_ICHECK_EQ(x.size(1), y.size(0)) << "Dimension mismatch for matmul"; +} +``` + +--- + +# Part 4: Function Export (tvm/ffi/function.h) + +## Export Macro + +```cpp +// Export typed C++ function for FFI +TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) + +// Example: +void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { + // kernel implementation +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); + +// Or with lambda +TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one, [](int x) { return x + 1; }); +``` + +## Supported Function Signatures + +The macro supports automatic type marshalling for: + +**Primitive Types:** +- `int32_t`, `int64_t` +- `uint32_t`, `uint64_t` +- `float`, `double` +- `bool` + +**FFI Types:** +- `tvm::ffi::TensorView`, `tvm::ffi::Tensor` +- `tvm::ffi::String`, `std::string`, `const char*` +- `tvm::ffi::Array`, `tvm::ffi::Map`, `tvm::ffi::Tuple<...>` +- `tvm::ffi::Shape`, `tvm::ffi::ShapeView` +- `tvm::ffi::Any`, `tvm::ffi::Optional` +- `DLDataType`, `DLDevice` + +**Return Types:** +- `void` (no return) +- Any of the above types + +**Example Signatures:** +```cpp +void func1(TensorView t1, TensorView t2) +int64_t func2(TensorView t) +Tensor func3(TensorView t, int64_t size, float alpha) +Array func4(Shape shape, String name) +Tuple func5(TensorView t) +``` + +## Function Class (for dynamic usage) + +```cpp +class Function { +public: + // Get global function by name + static std::optional GetGlobal(std::string_view name); + static Function GetGlobalRequired(std::string_view name); // Throws if not found + + // Set global function + static void SetGlobal(std::string_view name, Function func, bool override = false); + + // List all global function names + static std::vector ListGlobalNames(); + + // Remove global function + static void RemoveGlobal(const String& name); + + // Call function with arguments + template + Any operator()(Args&&... 
args) const; + + // Create from typed function + template + static Function FromTyped(TCallable callable); + + template + static Function FromTyped(TCallable callable, std::string name); +}; +``` + +**Usage Example:** +```cpp +// Get and call existing function +auto opt_func = Function::GetGlobal("my.function.name"); +if (opt_func.has_value()) { + Any result = (*opt_func)(arg1, arg2); +} + +// Create and register function +Function func = Function::FromTyped([](int64_t x) { return x * 2; }); +Function::SetGlobal("double", func); +``` + +## TypedFunction + +```cpp +// Type-safe function wrapper +TypedFunction add_func = [](int a, int b) { return a + b; }; + +// Call with type checking +int result = add_func(1, 2); + +// Convert to/from Function +Function erased = add_func; +TypedFunction typed = TypedFunction(erased); +``` + +--- + +# Part 5: Module Loading (tvm/ffi/extra/module.h) + +**Purpose:** Load and use functions from other compiled modules (for importing helper kernels). + +## Module Class + +```cpp +class Module { +public: + // Load module from file + static Module LoadFromFile(const String& file_name); + + // Get function from module + Optional GetFunction(const String& name, bool query_imports = true); + + // Check if function exists + bool ImplementsFunction(const String& name, bool query_imports = true); + + // Get function metadata + Optional GetFunctionDoc(const String& name, bool query_imports = true); + Optional GetFunctionMetadata(const String& name, bool query_imports = true); + + // Import another module + void ImportModule(const Module& other); + + // Export module + void WriteToFile(const String& file_name, const String& format) const; + Bytes SaveToBytes() const; + String InspectSource(const String& format) const; + Array GetWriteFormats() const; +}; +``` + +## Module Properties + +```cpp +enum ModulePropertyMask { + kBinarySerializable = 0b001, // Can be serialized to bytes + kRunnable = 0b010, // Has runnable functions + kCompilationExportable = 0b100 // Can export to .o/.cc/.cu +}; +``` + +## Symbol Names + +```cpp +namespace symbol { + constexpr const char* tvm_ffi_symbol_prefix = "__tvm_ffi_"; + constexpr const char* tvm_ffi_main = "__tvm_ffi_main"; + constexpr const char* tvm_ffi_library_ctx = "__tvm_ffi__library_ctx"; + constexpr const char* tvm_ffi_library_bin = "__tvm_ffi__library_bin"; + constexpr const char* tvm_ffi_metadata_prefix = "__tvm_ffi__metadata_"; +} +``` + +**Usage Example:** +```cpp +// Load helper module +Module helpers = Module::LoadFromFile("helpers.so"); + +// Get function from module +Optional opt_func = helpers->GetFunction("helper_kernel"); +if (opt_func.defined()) { + Function helper = opt_func.value(); + // Call helper function + helper(tensor1, tensor2); +} +``` + +--- + +# Part 6: Environment APIs (tvm/ffi/extra/c_env_api.h) + +## CUDA Stream Management + +```cpp +// Get current CUDA stream for device +TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) + +// Set current CUDA stream for device +int TVMFFIEnvSetStream(int32_t device_type, + int32_t device_id, + TVMFFIStreamHandle stream, + TVMFFIStreamHandle* opt_out_original_stream) + +// Usage in kernel +DLDevice dev = tensor.device(); +cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + +// Launch kernel on stream +my_kernel<<>>(...); +``` + +## Tensor Allocation + +```cpp +// Allocate tensor using environment allocator (respects TLS/global allocator) +int TVMFFIEnvTensorAlloc(DLTensor* prototype, TVMFFIObjectHandle* 
out) + +// Get/Set allocator +DLPackManagedTensorAllocator TVMFFIEnvGetDLPackManagedTensorAllocator() + +int TVMFFIEnvSetDLPackManagedTensorAllocator( + DLPackManagedTensorAllocator allocator, + int write_to_global_context, + DLPackManagedTensorAllocator* opt_out_original_allocator) + +// Usage: Allocate output tensor +ffi::Tensor output = ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, + shape, // ffi::ShapeView + dtype, // DLDataType + device // DLDevice +); +``` + +## Module Symbol Management + +```cpp +// Lookup function from module imports +int TVMFFIEnvModLookupFromImports(TVMFFIObjectHandle library_ctx, + const char* func_name, + TVMFFIObjectHandle* out) + +// Register context symbols (available when library is loaded) +int TVMFFIEnvModRegisterContextSymbol(const char* name, void* symbol) + +// Register system library symbols +int TVMFFIEnvModRegisterSystemLibSymbol(const char* name, void* symbol) + +// Register C API symbols +int TVMFFIEnvRegisterCAPI(const char* name, void* symbol) +``` + +## Signal Checking + +```cpp +// Check for environment signals (e.g., Python Ctrl+C interrupts) +int TVMFFIEnvCheckSignals() + +// Usage in long-running loops +for (int i = 0; i < iterations; ++i) { + if (TVMFFIEnvCheckSignals() != 0) { + throw tvm::ffi::EnvErrorAlreadySet(); + } + // ... do work +} +``` + +--- + +# Common Patterns and Examples + +## Pattern 1: Basic CUDA Kernel + +```cpp +#include +#include +#include +#include + +__global__ void AddKernel(float* a, float* b, float* c, int64_t n) { + int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + c[idx] = a[idx] + b[idx]; + } +} + +void Add(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { + // Validate inputs + TVM_FFI_ICHECK_EQ(a.ndim(), 1) << "a must be 1D"; + TVM_FFI_ICHECK_EQ(b.ndim(), 1) << "b must be 1D"; + TVM_FFI_ICHECK_EQ(c.ndim(), 1) << "c must be 1D"; + TVM_FFI_ICHECK_EQ(a.size(0), b.size(0)) << "Shape mismatch"; + TVM_FFI_ICHECK_EQ(a.size(0), c.size(0)) << "Shape mismatch"; + + // Check dtype + if (a.dtype().code != kDLFloat || a.dtype().bits != 32) { + TVM_FFI_THROW(TypeError) << "Expected float32 tensor"; + } + + // Get data + float* a_data = static_cast(a.data_ptr()); + float* b_data = static_cast(b.data_ptr()); + float* c_data = static_cast(c.data_ptr()); + int64_t n = a.size(0); + + // Get CUDA stream + DLDevice dev = a.device(); + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + + // Launch kernel + int64_t threads = 256; + int64_t blocks = (n + threads - 1) / threads; + AddKernel<<>>(a_data, b_data, c_data, n); +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(add, Add); +``` + +## Pattern 2: Multi-Dimensional Tensor Processing + +```cpp +void Process2D(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { + // Validate + TVM_FFI_ICHECK_EQ(input.ndim(), 2) << "Expected 2D tensor"; + TVM_FFI_ICHECK_EQ(output.ndim(), 2) << "Expected 2D tensor"; + + // Get dimensions + int64_t height = input.size(0); + int64_t width = input.size(1); + TVM_FFI_ICHECK_EQ(output.size(0), height); + TVM_FFI_ICHECK_EQ(output.size(1), width); + + // Check contiguous + TVM_FFI_ICHECK(input.IsContiguous()) << "Input must be contiguous"; + + // Access with strides (more general) + int64_t stride_h = input.stride(0); + int64_t stride_w = input.stride(1); + + // For contiguous tensors: element[i][j] at data[i * stride_h + j * stride_w] +} +``` + +## Pattern 3: Allocating Output Tensors + +```cpp +tvm::ffi::Tensor AllocateOutput(tvm::ffi::TensorView input, + 
tvm::ffi::ShapeView output_shape) { + // Create output with same device and dtype as input + DLDevice device = input.device(); + DLDataType dtype = input.dtype(); + + // Allocate using environment allocator + tvm::ffi::Tensor output = tvm::ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, + output_shape, + dtype, + device + ); + + return output; +} + +// Usage +void MyKernel(tvm::ffi::TensorView input) { + // Create output shape + Shape out_shape = {input.size(0), input.size(1) * 2}; + + // Allocate + Tensor output = AllocateOutput(input, out_shape); + + // Now use output.data_ptr() in kernel +} +``` + +## Pattern 4: Device-Specific Dispatch + +```cpp +void UniversalKernel(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { + DLDevice dev = input.device(); + + switch (dev.device_type) { + case kDLCUDA: { + // CUDA implementation + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + // Launch CUDA kernel + break; + } + case kDLCPU: { + // CPU implementation + float* in_data = static_cast(input.data_ptr()); + float* out_data = static_cast(output.data_ptr()); + // ... CPU code + break; + } + case kDLROCM: { + // ROCm/HIP implementation + break; + } + default: + TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type; + } +} +``` + +## Pattern 5: Using Configuration Parameters + +```cpp +void ConfigurableKernel(tvm::ffi::TensorView input, + tvm::ffi::TensorView output, + tvm::ffi::Map config) { + // Extract config with defaults + int64_t block_size = config.Get("block_size", 256); + int64_t num_threads = config.Get("num_threads", 1024); + + // Or check if key exists + if (config.count("custom_param") > 0) { + int64_t custom = config.at("custom_param"); + // use custom + } + + // Launch with config + // ... +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(configurable_kernel, ConfigurableKernel); +``` + +## Pattern 6: Returning Multiple Values + +```cpp +// Return tuple +tvm::ffi::Tuple +ComputeWithStats(tvm::ffi::TensorView input) { + // Allocate output + tvm::ffi::Tensor output = AllocateOutput(input, input.shape()); + + // Compute statistics + int64_t count = input.numel(); + float mean = 0.0f; // ... compute mean + + // Return multiple values + return {output, count, mean}; +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(compute_with_stats, ComputeWithStats); +``` + +## Pattern 7: Using Arrays for Variable Arguments + +```cpp +void BatchProcess(tvm::ffi::Array inputs, + tvm::ffi::Array outputs) { + TVM_FFI_ICHECK_EQ(inputs.size(), outputs.size()) << "Size mismatch"; + + for (size_t i = 0; i < inputs.size(); ++i) { + TensorView in = inputs[i]; + TensorView out = outputs[i]; + // Process each pair + } +} + +TVM_FFI_DLL_EXPORT_TYPED_FUNC(batch_process, BatchProcess); +``` + +--- + +# Quick Reference Summary + +## Most Common Imports + +```cpp +#include // TensorView, Tensor +#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC +#include // TVM_FFI_THROW, TVM_FFI_ICHECK +#include // TVMFFIEnvGetStream +``` + +## Most Common Function Signature + +```cpp +void MyKernel(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { + // 1. Validate inputs + // 2. Get CUDA stream + // 3. 
Launch kernel +} +TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); +``` + +## Most Common Validation Pattern + +```cpp +// Shape +TVM_FFI_ICHECK_EQ(tensor.ndim(), 2); +TVM_FFI_ICHECK_EQ(tensor.size(0), expected_dim0); + +// Dtype +DLDataType dt = tensor.dtype(); +TVM_FFI_ICHECK_EQ(dt.code, kDLFloat); +TVM_FFI_ICHECK_EQ(dt.bits, 32); + +// Device +TVM_FFI_ICHECK_EQ(tensor.device().device_type, kDLCUDA); + +// Memory layout +TVM_FFI_ICHECK(tensor.IsContiguous()); +``` + +## Most Common CUDA Launch Pattern + +```cpp +DLDevice dev = tensor.device(); +cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + +float* data = static_cast(tensor.data_ptr()); +int64_t n = tensor.numel(); + +int threads = 256; +int blocks = (n + threads - 1) / threads; +MyKernel<<>>(data, n); +``` + +--- + +This completes the comprehensive TVM FFI API reference. Use this as your guide for writing FFI-compatible CUDA kernels and host functions. +""" + +FFI_PROMPT = """ +You should use TVM FFI bindings for your code. + +# Required Headers + +```cpp +#include // Tensor, TensorView +#include // Function export macros +#include // Error handling +#include // Environment APIs (streams, allocators) +``` + +# Complete Example: CUDA Kernel Binding + +```cpp +// File: add_one_cuda.cu +#include +#include +#include +#include + +namespace my_kernels { + +__global__ void AddOneKernel(float* x, float* y, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + y[idx] = x[idx] + 1.0f; + } +} + +void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { + // Input validation + TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; + TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; + TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch"; + + // Get data pointers + float* x_data = static_cast(x.data_ptr()); + float* y_data = static_cast(y.data_ptr()); + int64_t n = x.size(0); + + // Get CUDA stream from environment + DLDevice dev = x.device(); + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + + // Launch kernel + int64_t threads = 256; + int64_t blocks = (n + threads - 1) / threads; + AddOneKernel<<>>(x_data, y_data, n); +} + +// Export the function with name "add_one_cuda" +TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne); + +} // namespace my_kernels +``` + +# TVM FFI API Documentation + +## 1. Tensor Container API (tvm/ffi/container/tensor.h) + +### Tensor Class +A managed n-dimensional array with reference counting. + +**Methods:** +- `void* data_ptr() const` - Returns raw data pointer +- `DLDevice device() const` - Returns device info (`.device_type`, `.device_id`) +- `int32_t ndim() const` - Returns number of dimensions +- `DLDataType dtype() const` - Returns data type (`.code`, `.bits`, `.lanes`) +- `ShapeView shape() const` - Returns shape array (indexable: `shape()[0]`, `shape()[1]`, ...) 
+- `ShapeView strides() const` - Returns strides array +- `int64_t size(int64_t idx) const` - Returns size of dimension at idx (negative idx counts from end) +- `int64_t stride(int64_t idx) const` - Returns stride of dimension at idx +- `int64_t numel() const` - Returns total number of elements +- `uint64_t byte_offset() const` - Returns byte offset +- `bool IsContiguous() const` - Checks if tensor memory is contiguous +- `bool IsAligned(size_t alignment) const` - Checks if data is aligned to given bytes + +**Static Factory Methods:** +- `static Tensor FromDLPack(DLManagedTensor* tensor, size_t require_alignment = 0, bool require_contiguous = false)` +- `static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, size_t require_alignment = 0, bool require_contiguous = false)` +- `template static Tensor FromNDAlloc(TNDAlloc alloc, ffi::ShapeView shape, DLDataType dtype, DLDevice device, ExtraArgs&&... extra_args)` +- `static Tensor FromEnvAlloc(int (*env_alloc)(DLTensor*, TVMFFIObjectHandle*), ffi::ShapeView shape, DLDataType dtype, DLDevice device)` + +**Conversion Methods:** +- `DLManagedTensor* ToDLPack() const` - Convert to DLPack managed tensor +- `DLManagedTensorVersioned* ToDLPackVersioned() const` - Convert to versioned DLPack +- `const DLTensor* GetDLTensorPtr() const` - Get underlying DLTensor pointer + +### TensorView Class +Non-owning lightweight view of a Tensor. Kernel entrypoints should use `tvm::ffi::TensorView` (or `const TensorView&`) for tensor inputs/outputs. + +**Constructors:** +- `TensorView(const Tensor& tensor)` - From Tensor +- `TensorView(const DLTensor* tensor)` - From DLTensor pointer + +**Methods (same interface as Tensor):** +- `void* data_ptr() const` +- `DLDevice device() const` +- `int32_t ndim() const` +- `DLDataType dtype() const` +- `ShapeView shape() const` +- `ShapeView strides() const` +- `int64_t size(int64_t idx) const` +- `int64_t stride(int64_t idx) const` +- `int64_t numel() const` +- `uint64_t byte_offset() const` +- `bool IsContiguous() const` + +### Utility Functions +- `bool IsContiguous(const DLTensor& arr)` - Check if DLTensor is contiguous +- `bool IsAligned(const DLTensor& arr, size_t alignment)` - Check alignment +- `bool IsDirectAddressDevice(const DLDevice& device)` - Check if device uses direct addressing +- `size_t GetDataSize(size_t numel, DLDataType dtype)` - Calculate bytes for packed data +- `size_t GetDataSize(const DLTensor& arr)` - Calculate bytes in DLTensor +- `size_t GetDataSize(const Tensor& tensor)` - Calculate bytes in Tensor +- `size_t GetDataSize(const TensorView& tensor)` - Calculate bytes in TensorView + +### Device Type Constants (DLDevice.device_type) +```cpp +kDLCPU = 1 // CPU +kDLCUDA = 2 // CUDA GPU +kDLCUDAHost = 3 // CUDA pinned memory +kDLROCM = 10 // ROCm/HIP +``` + +### Data Type Constants (DLDataType) +```cpp +// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes +kDLInt = 0 // Signed integer +kDLUInt = 1 // Unsigned integer +kDLFloat = 2 // IEEE floating point +kDLBfloat = 4 // Brain floating point + +// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1 +// Example: half/fp16 has code=2, bits=16, lanes=1 +``` + +## 2. Function API (tvm/ffi/function.h) + +### Function Class +Type-erased callable object. 
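+
+Before the detailed listing, a minimal usage sketch (the global name `example.double`, the lambda, and the wrapper function `RegisterAndCall` are illustrative, not part of the API):
+
+```cpp
+#include <tvm/ffi/function.h>
+
+// Register a typed C++ lambda under a global name, then look it up and call it.
+void RegisterAndCall() {
+  tvm::ffi::Function doubler =
+      tvm::ffi::Function::FromTyped([](int64_t x) -> int64_t { return x * 2; });
+  tvm::ffi::Function::SetGlobal("example.double", doubler);
+
+  auto opt_func = tvm::ffi::Function::GetGlobal("example.double");
+  if (opt_func.has_value()) {
+    // operator() is type-erased; the result comes back as an Any.
+    tvm::ffi::Any result = (*opt_func)(21);
+    (void)result;
+  }
+}
+```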
+ +**Constructors:** +- `Function(std::nullptr_t)` - Null constructor +- `template Function(TCallable packed_call)` - From callable (legacy) + +**Static Factory Methods:** +- `template static Function FromPacked(TCallable packed_call)` - From packed signature: `void(const AnyView*, int32_t, Any*)` or `void(PackedArgs, Any*)` +- `template static Function FromTyped(TCallable callable)` - From typed C++ function +- `template static Function FromTyped(TCallable callable, std::string name)` - With name for error messages +- `static Function FromExternC(void* self, TVMFFISafeCallType safe_call, void (*deleter)(void* self))` - From C callback + +**Global Registry:** +- `static std::optional GetGlobal(std::string_view name)` +- `static std::optional GetGlobal(const std::string& name)` +- `static std::optional GetGlobal(const String& name)` +- `static std::optional GetGlobal(const char* name)` +- `static Function GetGlobalRequired(std::string_view name)` - Throws if not found +- `static Function GetGlobalRequired(const std::string& name)` +- `static Function GetGlobalRequired(const String& name)` +- `static Function GetGlobalRequired(const char* name)` +- `static void SetGlobal(std::string_view name, Function func, bool override = false)` +- `static std::vector ListGlobalNames()` +- `static void RemoveGlobal(const String& name)` + +**Invocation:** +- `template Any operator()(Args&&... args) const` - Call with unpacked args +- `void CallPacked(const AnyView* args, int32_t num_args, Any* result) const` +- `void CallPacked(PackedArgs args, Any* result) const` +- `template static Any InvokeExternC(void* handle, TVMFFISafeCallType safe_call, Args&&... args)` + +### TypedFunction Class +Type-safe wrapper around Function. + +**Constructors:** +- `TypedFunction()` - Default +- `TypedFunction(std::nullptr_t)` +- `TypedFunction(Function packed)` - From Function +- `template TypedFunction(FLambda typed_lambda)` - From lambda +- `template TypedFunction(FLambda typed_lambda, std::string name)` - With name + +**Methods:** +- `R operator()(Args... args) const` - Type-safe invocation +- `operator Function() const` - Convert to Function +- `const Function& packed() const&` - Get internal Function +- `Function&& packed() &&` - Move internal Function +- `static std::string TypeSchema()` - Get JSON type schema + +### PackedArgs Class +Represents packed arguments. + +**Constructor:** +- `PackedArgs(const AnyView* data, int32_t size)` + +**Methods:** +- `int size() const` - Number of arguments +- `const AnyView* data() const` - Raw argument array +- `PackedArgs Slice(int begin, int end = -1) const` - Get subset +- `AnyView operator[](int i) const` - Access argument +- `template static void Fill(AnyView* data, Args&&... args)` - Pack arguments + +### Export Macro +```cpp +// Export typed C++ function for FFI +TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) + +// Example: +void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... } +TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); +``` + +**Supported function signatures:** +- `void func(TensorView t1, TensorView t2, ...)` +- `void func(TensorView t, int64_t size, float alpha, ...)` +- `int64_t func(TensorView t)` +- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string` + +## 3. Error Handling API (tvm/ffi/error.h) + +### Error Class +Exception object with stack trace. 
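+
+A brief sketch of how an error surfaces on the C++ side (the helpers `CheckPositive` and `Caller` are illustrative); `TVM_FFI_THROW` raises a `tvm::ffi::Error` carrying the kind, the streamed message, and a backtrace, which the caller can catch and inspect:
+
+```cpp
+#include <tvm/ffi/error.h>
+
+// Validate a user-supplied value and raise a ValueError on bad input.
+void CheckPositive(int64_t n) {
+  if (n <= 0) {
+    TVM_FFI_THROW(ValueError) << "n must be positive, got " << n;
+  }
+}
+
+// Host-side callers can catch the exception and inspect it.
+void Caller() {
+  try {
+    CheckPositive(-1);
+  } catch (const tvm::ffi::Error& err) {
+    // err.kind() is the category ("ValueError" here); err.message() holds the text.
+  }
+}
+```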
+ +**Constructor:** +- `Error(std::string kind, std::string message, std::string backtrace)` +- `Error(std::string kind, std::string message, const TVMFFIByteArray* backtrace)` + +**Methods:** +- `std::string kind() const` - Error category +- `std::string message() const` - Error description +- `std::string backtrace() const` - Stack trace +- `std::string TracebackMostRecentCallLast() const` - Python-style traceback +- `const char* what() const noexcept override` - Standard exception interface +- `void UpdateBacktrace(const TVMFFIByteArray* backtrace_str, int32_t update_mode)` - Modify traceback + +### EnvErrorAlreadySet Exception +Thrown when error exists in frontend environment (e.g., Python interrupt). + +**Usage:** +```cpp +void LongRunningFunction() { + if (TVMFFIEnvCheckSignals() != 0) { + throw ::tvm::ffi::EnvErrorAlreadySet(); + } + // do work here +} +``` + +### Error Macros +```cpp +// Throw with backtrace +TVM_FFI_THROW(ErrorKind) << message + +// Log to stderr and throw (for startup functions) +TVM_FFI_LOG_AND_THROW(ErrorKind) << message + +// Check C function return code +TVM_FFI_CHECK_SAFE_CALL(func) + +// Wrap C++ code for C API +TVM_FFI_SAFE_CALL_BEGIN(); +// c++ code region here +TVM_FFI_SAFE_CALL_END(); +``` + +### Assertion Macros +```cpp +TVM_FFI_ICHECK(condition) << "message" // General check +TVM_FFI_CHECK(condition, ErrorKind) << "message" // Custom error type +TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y +TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y +TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y +TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y +TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y +TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y +TVM_FFI_ICHECK_NOTNULL(ptr) << "ptr must not be null" // ptr != nullptr +``` + +**Common Error Kinds:** +- `ValueError` - Invalid argument values +- `TypeError` - Type mismatch +- `RuntimeError` - Runtime failures (CUDA errors, etc.) +- `InternalError` - Internal logic errors + +### Utility Functions +- `int32_t TypeKeyToIndex(std::string_view type_key)` - Get type index from key + +## 4. 
Environment API (tvm/ffi/extra/c_env_api.h) + +### Stream Management +```cpp +// Set current stream for a device +int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, + TVMFFIStreamHandle stream, + TVMFFIStreamHandle* opt_out_original_stream) + +// Get current stream for a device +TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) + +// Usage example: +DLDevice dev = tensor.device(); +cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); +``` + +### Tensor Allocation +```cpp +// Set DLPack allocator (TLS or global) +int TVMFFIEnvSetDLPackManagedTensorAllocator( + DLPackManagedTensorAllocator allocator, + int write_to_global_context, + DLPackManagedTensorAllocator* opt_out_original_allocator) + +// Get current allocator +DLPackManagedTensorAllocator TVMFFIEnvGetDLPackManagedTensorAllocator() + +// Allocate tensor using environment allocator +int TVMFFIEnvTensorAlloc(DLTensor* prototype, TVMFFIObjectHandle* out) + +// Usage with FromEnvAlloc: +ffi::Tensor tensor = ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, shape, dtype, device); +``` + +### Module & Symbol Management +```cpp +// Lookup function from module imports +int TVMFFIEnvModLookupFromImports(TVMFFIObjectHandle library_ctx, + const char* func_name, + TVMFFIObjectHandle* out) + +// Register context symbol (available when library is loaded) +int TVMFFIEnvModRegisterContextSymbol(const char* name, void* symbol) + +// Register system library symbol +int TVMFFIEnvModRegisterSystemLibSymbol(const char* name, void* symbol) + +// Register C API symbol +int TVMFFIEnvRegisterCAPI(const char* name, void* symbol) +``` + +### Utilities +```cpp +// Check for environment signals (e.g., Python Ctrl+C) +int TVMFFIEnvCheckSignals() +``` + +### Type Definitions +```cpp +typedef void* TVMFFIStreamHandle // Stream handle type +``` + +# Common Patterns + +## Dtype Validation +```cpp +void MyKernel(tvm::ffi::TensorView x) { + DLDataType dt = x.dtype(); + TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype"; + TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 32-bit dtype"; + + // Or for user-facing errors: + if (dt.code != kDLFloat || dt.bits != 32) { + TVM_FFI_THROW(TypeError) << "Expected float32, got " + << "code=" << dt.code << " bits=" << dt.bits; + } +} +``` + +## Shape Validation +```cpp +void MatMul(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { + TVM_FFI_ICHECK_EQ(a.ndim(), 2) << "a must be 2D"; + TVM_FFI_ICHECK_EQ(b.ndim(), 2) << "b must be 2D"; + TVM_FFI_ICHECK_EQ(a.size(1), b.size(0)) << "Shape mismatch for matmul"; + TVM_FFI_ICHECK_EQ(c.size(0), a.size(0)) << "Output shape mismatch"; + TVM_FFI_ICHECK_EQ(c.size(1), b.size(1)) << "Output shape mismatch"; +} +``` + +## Multi-dimensional Indexing +```cpp +void Process2D(tvm::ffi::TensorView x) { + int64_t height = x.size(0); + int64_t width = x.size(1); + float* data = static_cast(x.data_ptr()); + + // If contiguous (row-major), access as: data[i * width + j] + TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor"; + + // With strides: + int64_t stride0 = x.stride(0); + int64_t stride1 = x.stride(1); + // Access: data[i * stride0 + j * stride1] +} +``` + +## Device-specific Kernels +```cpp +void MyKernel(tvm::ffi::TensorView x) { + DLDevice dev = x.device(); + if (dev.device_type == kDLCUDA) { + // Launch CUDA kernel + cudaStream_t stream = static_cast( + TVMFFIEnvGetStream(dev.device_type, dev.device_id)); + // kernel<<>>(...); + } else if (dev.device_type == kDLCPU) { + // CPU 
implementation + } else { + TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type; + } +} +``` + +## Allocating New Tensors +```cpp +void CreateOutput(tvm::ffi::TensorView input) { + // Create output tensor with same device and dtype as input + ffi::ShapeView shape = input.shape(); + DLDataType dtype = input.dtype(); + DLDevice device = input.device(); + + // Use environment allocator + ffi::Tensor output = ffi::Tensor::FromEnvAlloc( + TVMFFIEnvTensorAlloc, shape, dtype, device); +} +``` +""" + diff --git a/flashinfer_bench/ffi/load_inline.py b/flashinfer_bench/agents/load_inline.py similarity index 100% rename from flashinfer_bench/ffi/load_inline.py rename to flashinfer_bench/agents/load_inline.py diff --git a/flashinfer_bench/ffi/test_existing_prompt_sol.py b/flashinfer_bench/agents/test_existing_prompt_sol.py similarity index 100% rename from flashinfer_bench/ffi/test_existing_prompt_sol.py rename to flashinfer_bench/agents/test_existing_prompt_sol.py diff --git a/flashinfer_bench/ffi/test_prompt.py b/flashinfer_bench/agents/test_prompt.py similarity index 100% rename from flashinfer_bench/ffi/test_prompt.py rename to flashinfer_bench/agents/test_prompt.py diff --git a/flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt b/flashinfer_bench/agents/test_results/claude-opus-4-1-20250805.txt similarity index 100% rename from flashinfer_bench/ffi/test_results/claude-opus-4-1-20250805.txt rename to flashinfer_bench/agents/test_results/claude-opus-4-1-20250805.txt diff --git a/flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt b/flashinfer_bench/agents/test_results/gpt-5-2025-08-07.txt similarity index 100% rename from flashinfer_bench/ffi/test_results/gpt-5-2025-08-07.txt rename to flashinfer_bench/agents/test_results/gpt-5-2025-08-07.txt diff --git a/flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt b/flashinfer_bench/agents/test_results/gpt-5-mini-2025-08-07.txt similarity index 100% rename from flashinfer_bench/ffi/test_results/gpt-5-mini-2025-08-07.txt rename to flashinfer_bench/agents/test_results/gpt-5-mini-2025-08-07.txt diff --git a/flashinfer_bench/ffi/test_results/o3.txt b/flashinfer_bench/agents/test_results/o3.txt similarity index 100% rename from flashinfer_bench/ffi/test_results/o3.txt rename to flashinfer_bench/agents/test_results/o3.txt diff --git a/flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt b/flashinfer_bench/agents/test_results/o4-mini-2025-04-16.txt similarity index 100% rename from flashinfer_bench/ffi/test_results/o4-mini-2025-04-16.txt rename to flashinfer_bench/agents/test_results/o4-mini-2025-04-16.txt diff --git a/flashinfer_bench/agents/torch_load_inline.py b/flashinfer_bench/agents/torch_load_inline.py new file mode 100644 index 0000000..5cb8c7c --- /dev/null +++ b/flashinfer_bench/agents/torch_load_inline.py @@ -0,0 +1,87 @@ +import torch +from torch.utils.cpp_extension import load_inline + +cuda_source = """ +#include +#include +#include + +namespace torch_example_cuda { + +__global__ void AddOneKernel(float* x, float* y, int n) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < n) { + y[idx] = x[idx] + 1; + } +} + +torch::Tensor add_one_cuda(torch::Tensor x) { + TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); + TORCH_CHECK(x.is_contiguous(), "x must be contiguous"); + TORCH_CHECK(x.dtype() == torch::kFloat32, "x must be float32"); + + auto y = torch::empty_like(x); + + int64_t n = x.numel(); + float* x_data = x.data_ptr(); + float* y_data = y.data_ptr(); + + int64_t threads = 256; + 
int64_t blocks = (n + threads - 1) / threads; + + cudaStream_t stream = c10::cuda::getCurrentCUDAStream(); + + AddOneKernel<<>>(x_data, y_data, n); + + return y; +} + +} // namespace torch_example_cuda +""" + +cpp_source = """ +#include + +namespace torch_example_cuda { +torch::Tensor add_one_cuda(torch::Tensor x); +} + +torch::Tensor add_one(torch::Tensor x) { + return torch_example_cuda::add_one_cuda(x); +} + +PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { + m.def("add_one", &add_one, "Add one to each element (CUDA)"); +} +""" + + +def main(): + print("Compiling CUDA extension with torch.utils.cpp_extension.load_inline...") + + module = load_inline( + name="add_one_cuda", + cpp_sources=cpp_source, + cuda_sources=cuda_source, + verbose=True, + with_cuda=True, + ) + + print("Compilation successful!") + + # Test the compiled module + print("\nTesting the module...") + x = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], device="cuda") + print(f"Input: {x}") + + y = module.add_one(x) + print(f"Output: {y}") + + expected = x + 1 + print(f"Expected: {expected}") + print(f"Match: {torch.allclose(y, expected)}") + + +if __name__ == "__main__": + main() + diff --git a/flashinfer_bench/ffi/ffi_prompt.py b/flashinfer_bench/ffi/ffi_prompt.py deleted file mode 100644 index 59ee283..0000000 --- a/flashinfer_bench/ffi/ffi_prompt.py +++ /dev/null @@ -1,647 +0,0 @@ -FFI_PROMPT_SIMPLE = """ -Use TVM FFI format for your generated kernel host function and bindings - -Use the following headers: -#include // TensorView: tensor arguments -#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC -#include // TVM_FFI_ICHECK, TVM_FFI_THROW - -Include when using CUDA / env-managed streams or allocators: -#include // TVMFFIEnvGetStream, TVMFFIEnvTensorAlloc - -# TVM FFI API Reference - -## TensorView (tvm/ffi/container/tensor.h) - -**Purpose:** Non-owning view of tensor data. Use `tvm::ffi::TensorView` for function parameters. - -### Essential Methods -```cpp -void* data_ptr() const // Raw data pointer -DLDevice device() const // Device info (.device_type, .device_id) -DLDataType dtype() const // Data type (.code, .bits, .lanes) -int32_t ndim() const // Number of dimensions -int64_t size(int64_t idx) const // Size of dimension idx (negative idx: count from end) -int64_t numel() const // Total number of elements -bool IsContiguous() const // Check if memory is contiguous -``` - -### ShapeView Access -```cpp -ShapeView shape() const // Get shape (array-like) -ShapeView strides() const // Get strides (array-like) - -// ShapeView can be indexed like an array: -// tensor.shape()[0], tensor.shape()[1], etc. -``` - -### Device Type Constants (DLDevice.device_type) -```cpp -kDLCPU = 1 // CPU -kDLCUDA = 2 // CUDA GPU -kDLCUDAHost = 3 // CUDA pinned memory -kDLROCM = 10 // ROCm/HIP -``` - -### Data Type Constants (DLDataType) -```cpp -// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes -// Common .code values: -kDLInt = 0 // Signed integer -kDLUInt = 1 // Unsigned integer -kDLFloat = 2 // IEEE floating point -kDLBfloat = 4 // Brain floating point - -// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1 -// Example: half/fp16 has code=2, bits=16, lanes=1 -``` - -## Function Export (tvm/ffi/function.h) - -### Export Macro -```cpp -// Use this to export your C++ function for FFI -TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) - -// Example: -void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... 
} -TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); -``` - -**Supported function signatures:** -- `void func(TensorView t1, TensorView t2, ...)` -- `void func(TensorView t, int64_t size, float alpha, ...)` -- `int64_t func(TensorView t)` -- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string` - -## Error Handling (tvm/ffi/error.h) - -### Throwing Errors -```cpp -// Throw with custom error kind -TVM_FFI_THROW(ValueError) << "Invalid input: " << x; -TVM_FFI_THROW(RuntimeError) << "CUDA error: " << cudaGetErrorString(err); -TVM_FFI_THROW(TypeError) << "Expected float32, got int32"; -``` - -### Assertions (for internal logic errors) -```cpp -TVM_FFI_ICHECK(condition) << "message" // General check -TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y -TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y -TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y -TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y -TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y -TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y -``` - -### User Input Validation (use TVM_FFI_THROW instead) -```cpp -// For user-facing errors, use TVM_FFI_THROW with appropriate error kind: -if (x.ndim() != 2) { - TVM_FFI_THROW(ValueError) << "Expected 2D tensor, got " << x.ndim() << "D"; -} -if (x.dtype().code != kDLFloat || x.dtype().bits != 32) { - TVM_FFI_THROW(TypeError) << "Expected float32 dtype"; -} -``` - -## CUDA Stream Management (tvm/ffi/extra/c_env_api.h) - -```cpp -// Get the current CUDA stream for a device -TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) - -// Usage: -DLDevice dev = tensor.device(); -cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - -// Launch kernel on the stream: -my_kernel<<>>(...); -``` - -# Common Patterns - -## Dtype Validation -```cpp -void MyKernel(tvm::ffi::TensorView x) { - DLDataType dt = x.dtype(); - TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype"; - TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 32-bit dtype"; - // Or use TVM_FFI_THROW for user-facing validation -} -``` - -## Shape Validation -```cpp -void MatMul(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { - TVM_FFI_ICHECK_EQ(a.ndim(), 2) << "a must be 2D"; - TVM_FFI_ICHECK_EQ(b.ndim(), 2) << "b must be 2D"; - TVM_FFI_ICHECK_EQ(a.size(1), b.size(0)) << "Shape mismatch for matmul"; - TVM_FFI_ICHECK_EQ(c.size(0), a.size(0)) << "Output shape mismatch"; - TVM_FFI_ICHECK_EQ(c.size(1), b.size(1)) << "Output shape mismatch"; -} -``` - -## Multi-dimensional Indexing -```cpp -void Process2D(tvm::ffi::TensorView x) { - int64_t height = x.size(0); - int64_t width = x.size(1); - float* data = static_cast(x.data_ptr()); - - // If contiguous (row-major), access as: data[i * width + j] - TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor"; -} -``` - -## Device-specific Kernels -```cpp -void MyKernel(tvm::ffi::TensorView x) { - DLDevice dev = x.device(); - if (dev.device_type == kDLCUDA) { - // Launch CUDA kernel - } else if (dev.device_type == kDLCPU) { - // CPU implementation - } else { - TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type; - } -} -``` - -Example Usage of FFI binding: -```cpp -// File: add_one_cuda.cu -#include -#include -#include -#include - -namespace my_kernels { - -__global__ void AddOneKernel(float* x, float* y, int n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - 
y[idx] = x[idx] + 1.0f; - } -} - -void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { - // Input validation - TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; - TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; - TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch"; - - // Get data pointers - float* x_data = static_cast(x.data_ptr()); - float* y_data = static_cast(y.data_ptr()); - int64_t n = x.size(0); - - // Get CUDA stream from environment - DLDevice dev = x.device(); - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - - // Launch kernel - int64_t threads = 256; - int64_t blocks = (n + threads - 1) / threads; - AddOneKernel<<>>(x_data, y_data, n); -} - -// Export the function with name "add_one_cuda" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne); - -} // namespace my_kernels -``` -""" - -FFI_PROMPT = """ -You should use TVM FFI bindings for your code. - -# Required Headers - -```cpp -#include // Tensor, TensorView -#include // Function export macros -#include // Error handling -#include // Environment APIs (streams, allocators) -``` - -# Complete Example: CUDA Kernel Binding - -```cpp -// File: add_one_cuda.cu -#include -#include -#include -#include - -namespace my_kernels { - -__global__ void AddOneKernel(float* x, float* y, int n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - y[idx] = x[idx] + 1.0f; - } -} - -void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { - // Input validation - TVM_FFI_ICHECK_EQ(x.ndim(), 1) << "x must be 1D"; - TVM_FFI_ICHECK_EQ(y.ndim(), 1) << "y must be 1D"; - TVM_FFI_ICHECK_EQ(x.size(0), y.size(0)) << "Shape mismatch"; - - // Get data pointers - float* x_data = static_cast(x.data_ptr()); - float* y_data = static_cast(y.data_ptr()); - int64_t n = x.size(0); - - // Get CUDA stream from environment - DLDevice dev = x.device(); - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - - // Launch kernel - int64_t threads = 256; - int64_t blocks = (n + threads - 1) / threads; - AddOneKernel<<>>(x_data, y_data, n); -} - -// Export the function with name "add_one_cuda" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, AddOne); - -} // namespace my_kernels -``` - -# TVM FFI API Documentation - -## 1. Tensor Container API (tvm/ffi/container/tensor.h) - -### Tensor Class -A managed n-dimensional array with reference counting. - -**Methods:** -- `void* data_ptr() const` - Returns raw data pointer -- `DLDevice device() const` - Returns device info (`.device_type`, `.device_id`) -- `int32_t ndim() const` - Returns number of dimensions -- `DLDataType dtype() const` - Returns data type (`.code`, `.bits`, `.lanes`) -- `ShapeView shape() const` - Returns shape array (indexable: `shape()[0]`, `shape()[1]`, ...) 
-- `ShapeView strides() const` - Returns strides array -- `int64_t size(int64_t idx) const` - Returns size of dimension at idx (negative idx counts from end) -- `int64_t stride(int64_t idx) const` - Returns stride of dimension at idx -- `int64_t numel() const` - Returns total number of elements -- `uint64_t byte_offset() const` - Returns byte offset -- `bool IsContiguous() const` - Checks if tensor memory is contiguous -- `bool IsAligned(size_t alignment) const` - Checks if data is aligned to given bytes - -**Static Factory Methods:** -- `static Tensor FromDLPack(DLManagedTensor* tensor, size_t require_alignment = 0, bool require_contiguous = false)` -- `static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, size_t require_alignment = 0, bool require_contiguous = false)` -- `template static Tensor FromNDAlloc(TNDAlloc alloc, ffi::ShapeView shape, DLDataType dtype, DLDevice device, ExtraArgs&&... extra_args)` -- `static Tensor FromEnvAlloc(int (*env_alloc)(DLTensor*, TVMFFIObjectHandle*), ffi::ShapeView shape, DLDataType dtype, DLDevice device)` - -**Conversion Methods:** -- `DLManagedTensor* ToDLPack() const` - Convert to DLPack managed tensor -- `DLManagedTensorVersioned* ToDLPackVersioned() const` - Convert to versioned DLPack -- `const DLTensor* GetDLTensorPtr() const` - Get underlying DLTensor pointer - -### TensorView Class -Non-owning lightweight view of a Tensor. Kernel entrypoints should use `tvm::ffi::TensorView` (or `const TensorView&`) for tensor inputs/outputs. - -**Constructors:** -- `TensorView(const Tensor& tensor)` - From Tensor -- `TensorView(const DLTensor* tensor)` - From DLTensor pointer - -**Methods (same interface as Tensor):** -- `void* data_ptr() const` -- `DLDevice device() const` -- `int32_t ndim() const` -- `DLDataType dtype() const` -- `ShapeView shape() const` -- `ShapeView strides() const` -- `int64_t size(int64_t idx) const` -- `int64_t stride(int64_t idx) const` -- `int64_t numel() const` -- `uint64_t byte_offset() const` -- `bool IsContiguous() const` - -### Utility Functions -- `bool IsContiguous(const DLTensor& arr)` - Check if DLTensor is contiguous -- `bool IsAligned(const DLTensor& arr, size_t alignment)` - Check alignment -- `bool IsDirectAddressDevice(const DLDevice& device)` - Check if device uses direct addressing -- `size_t GetDataSize(size_t numel, DLDataType dtype)` - Calculate bytes for packed data -- `size_t GetDataSize(const DLTensor& arr)` - Calculate bytes in DLTensor -- `size_t GetDataSize(const Tensor& tensor)` - Calculate bytes in Tensor -- `size_t GetDataSize(const TensorView& tensor)` - Calculate bytes in TensorView - -### Device Type Constants (DLDevice.device_type) -```cpp -kDLCPU = 1 // CPU -kDLCUDA = 2 // CUDA GPU -kDLCUDAHost = 3 // CUDA pinned memory -kDLROCM = 10 // ROCm/HIP -``` - -### Data Type Constants (DLDataType) -```cpp -// DLDataType has: uint8_t code, uint8_t bits, uint16_t lanes -kDLInt = 0 // Signed integer -kDLUInt = 1 // Unsigned integer -kDLFloat = 2 // IEEE floating point -kDLBfloat = 4 // Brain floating point - -// Example: float32 has code=2 (kDLFloat), bits=32, lanes=1 -// Example: half/fp16 has code=2, bits=16, lanes=1 -``` - -## 2. Function API (tvm/ffi/function.h) - -### Function Class -Type-erased callable object. 
- -**Constructors:** -- `Function(std::nullptr_t)` - Null constructor -- `template Function(TCallable packed_call)` - From callable (legacy) - -**Static Factory Methods:** -- `template static Function FromPacked(TCallable packed_call)` - From packed signature: `void(const AnyView*, int32_t, Any*)` or `void(PackedArgs, Any*)` -- `template static Function FromTyped(TCallable callable)` - From typed C++ function -- `template static Function FromTyped(TCallable callable, std::string name)` - With name for error messages -- `static Function FromExternC(void* self, TVMFFISafeCallType safe_call, void (*deleter)(void* self))` - From C callback - -**Global Registry:** -- `static std::optional GetGlobal(std::string_view name)` -- `static std::optional GetGlobal(const std::string& name)` -- `static std::optional GetGlobal(const String& name)` -- `static std::optional GetGlobal(const char* name)` -- `static Function GetGlobalRequired(std::string_view name)` - Throws if not found -- `static Function GetGlobalRequired(const std::string& name)` -- `static Function GetGlobalRequired(const String& name)` -- `static Function GetGlobalRequired(const char* name)` -- `static void SetGlobal(std::string_view name, Function func, bool override = false)` -- `static std::vector ListGlobalNames()` -- `static void RemoveGlobal(const String& name)` - -**Invocation:** -- `template Any operator()(Args&&... args) const` - Call with unpacked args -- `void CallPacked(const AnyView* args, int32_t num_args, Any* result) const` -- `void CallPacked(PackedArgs args, Any* result) const` -- `template static Any InvokeExternC(void* handle, TVMFFISafeCallType safe_call, Args&&... args)` - -### TypedFunction Class -Type-safe wrapper around Function. - -**Constructors:** -- `TypedFunction()` - Default -- `TypedFunction(std::nullptr_t)` -- `TypedFunction(Function packed)` - From Function -- `template TypedFunction(FLambda typed_lambda)` - From lambda -- `template TypedFunction(FLambda typed_lambda, std::string name)` - With name - -**Methods:** -- `R operator()(Args... args) const` - Type-safe invocation -- `operator Function() const` - Convert to Function -- `const Function& packed() const&` - Get internal Function -- `Function&& packed() &&` - Move internal Function -- `static std::string TypeSchema()` - Get JSON type schema - -### PackedArgs Class -Represents packed arguments. - -**Constructor:** -- `PackedArgs(const AnyView* data, int32_t size)` - -**Methods:** -- `int size() const` - Number of arguments -- `const AnyView* data() const` - Raw argument array -- `PackedArgs Slice(int begin, int end = -1) const` - Get subset -- `AnyView operator[](int i) const` - Access argument -- `template static void Fill(AnyView* data, Args&&... args)` - Pack arguments - -### Export Macro -```cpp -// Export typed C++ function for FFI -TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) - -// Example: -void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { ... } -TVM_FFI_DLL_EXPORT_TYPED_FUNC(my_kernel, MyKernel); -``` - -**Supported function signatures:** -- `void func(TensorView t1, TensorView t2, ...)` -- `void func(TensorView t, int64_t size, float alpha, ...)` -- `int64_t func(TensorView t)` -- Any combination of: `TensorView`, `int32_t`, `int64_t`, `float`, `double`, `bool`, `std::string` - -## 3. Error Handling API (tvm/ffi/error.h) - -### Error Class -Exception object with stack trace. 
- -**Constructor:** -- `Error(std::string kind, std::string message, std::string backtrace)` -- `Error(std::string kind, std::string message, const TVMFFIByteArray* backtrace)` - -**Methods:** -- `std::string kind() const` - Error category -- `std::string message() const` - Error description -- `std::string backtrace() const` - Stack trace -- `std::string TracebackMostRecentCallLast() const` - Python-style traceback -- `const char* what() const noexcept override` - Standard exception interface -- `void UpdateBacktrace(const TVMFFIByteArray* backtrace_str, int32_t update_mode)` - Modify traceback - -### EnvErrorAlreadySet Exception -Thrown when error exists in frontend environment (e.g., Python interrupt). - -**Usage:** -```cpp -void LongRunningFunction() { - if (TVMFFIEnvCheckSignals() != 0) { - throw ::tvm::ffi::EnvErrorAlreadySet(); - } - // do work here -} -``` - -### Error Macros -```cpp -// Throw with backtrace -TVM_FFI_THROW(ErrorKind) << message - -// Log to stderr and throw (for startup functions) -TVM_FFI_LOG_AND_THROW(ErrorKind) << message - -// Check C function return code -TVM_FFI_CHECK_SAFE_CALL(func) - -// Wrap C++ code for C API -TVM_FFI_SAFE_CALL_BEGIN(); -// c++ code region here -TVM_FFI_SAFE_CALL_END(); -``` - -### Assertion Macros -```cpp -TVM_FFI_ICHECK(condition) << "message" // General check -TVM_FFI_CHECK(condition, ErrorKind) << "message" // Custom error type -TVM_FFI_ICHECK_EQ(x, y) << "x must equal y" // x == y -TVM_FFI_ICHECK_NE(x, y) << "x must not equal y" // x != y -TVM_FFI_ICHECK_LT(x, y) << "x must be less than y" // x < y -TVM_FFI_ICHECK_LE(x, y) << "x must be at most y" // x <= y -TVM_FFI_ICHECK_GT(x, y) << "x must be greater than y" // x > y -TVM_FFI_ICHECK_GE(x, y) << "x must be at least y" // x >= y -TVM_FFI_ICHECK_NOTNULL(ptr) << "ptr must not be null" // ptr != nullptr -``` - -**Common Error Kinds:** -- `ValueError` - Invalid argument values -- `TypeError` - Type mismatch -- `RuntimeError` - Runtime failures (CUDA errors, etc.) -- `InternalError` - Internal logic errors - -### Utility Functions -- `int32_t TypeKeyToIndex(std::string_view type_key)` - Get type index from key - -## 4. 
Environment API (tvm/ffi/extra/c_env_api.h) - -### Stream Management -```cpp -// Set current stream for a device -int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, - TVMFFIStreamHandle stream, - TVMFFIStreamHandle* opt_out_original_stream) - -// Get current stream for a device -TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) - -// Usage example: -DLDevice dev = tensor.device(); -cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev.device_type, dev.device_id)); -``` - -### Tensor Allocation -```cpp -// Set DLPack allocator (TLS or global) -int TVMFFIEnvSetDLPackManagedTensorAllocator( - DLPackManagedTensorAllocator allocator, - int write_to_global_context, - DLPackManagedTensorAllocator* opt_out_original_allocator) - -// Get current allocator -DLPackManagedTensorAllocator TVMFFIEnvGetDLPackManagedTensorAllocator() - -// Allocate tensor using environment allocator -int TVMFFIEnvTensorAlloc(DLTensor* prototype, TVMFFIObjectHandle* out) - -// Usage with FromEnvAlloc: -ffi::Tensor tensor = ffi::Tensor::FromEnvAlloc( - TVMFFIEnvTensorAlloc, shape, dtype, device); -``` - -### Module & Symbol Management -```cpp -// Lookup function from module imports -int TVMFFIEnvModLookupFromImports(TVMFFIObjectHandle library_ctx, - const char* func_name, - TVMFFIObjectHandle* out) - -// Register context symbol (available when library is loaded) -int TVMFFIEnvModRegisterContextSymbol(const char* name, void* symbol) - -// Register system library symbol -int TVMFFIEnvModRegisterSystemLibSymbol(const char* name, void* symbol) - -// Register C API symbol -int TVMFFIEnvRegisterCAPI(const char* name, void* symbol) -``` - -### Utilities -```cpp -// Check for environment signals (e.g., Python Ctrl+C) -int TVMFFIEnvCheckSignals() -``` - -### Type Definitions -```cpp -typedef void* TVMFFIStreamHandle // Stream handle type -``` - -# Common Patterns - -## Dtype Validation -```cpp -void MyKernel(tvm::ffi::TensorView x) { - DLDataType dt = x.dtype(); - TVM_FFI_ICHECK_EQ(dt.code, kDLFloat) << "Expected float dtype"; - TVM_FFI_ICHECK_EQ(dt.bits, 32) << "Expected 32-bit dtype"; - - // Or for user-facing errors: - if (dt.code != kDLFloat || dt.bits != 32) { - TVM_FFI_THROW(TypeError) << "Expected float32, got " - << "code=" << dt.code << " bits=" << dt.bits; - } -} -``` - -## Shape Validation -```cpp -void MatMul(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { - TVM_FFI_ICHECK_EQ(a.ndim(), 2) << "a must be 2D"; - TVM_FFI_ICHECK_EQ(b.ndim(), 2) << "b must be 2D"; - TVM_FFI_ICHECK_EQ(a.size(1), b.size(0)) << "Shape mismatch for matmul"; - TVM_FFI_ICHECK_EQ(c.size(0), a.size(0)) << "Output shape mismatch"; - TVM_FFI_ICHECK_EQ(c.size(1), b.size(1)) << "Output shape mismatch"; -} -``` - -## Multi-dimensional Indexing -```cpp -void Process2D(tvm::ffi::TensorView x) { - int64_t height = x.size(0); - int64_t width = x.size(1); - float* data = static_cast(x.data_ptr()); - - // If contiguous (row-major), access as: data[i * width + j] - TVM_FFI_ICHECK(x.IsContiguous()) << "Expected contiguous tensor"; - - // With strides: - int64_t stride0 = x.stride(0); - int64_t stride1 = x.stride(1); - // Access: data[i * stride0 + j * stride1] -} -``` - -## Device-specific Kernels -```cpp -void MyKernel(tvm::ffi::TensorView x) { - DLDevice dev = x.device(); - if (dev.device_type == kDLCUDA) { - // Launch CUDA kernel - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - // kernel<<>>(...); - } else if (dev.device_type == kDLCPU) { - // CPU 
implementation - } else { - TVM_FFI_THROW(RuntimeError) << "Unsupported device type: " << dev.device_type; - } -} -``` - -## Allocating New Tensors -```cpp -void CreateOutput(tvm::ffi::TensorView input) { - // Create output tensor with same device and dtype as input - ffi::ShapeView shape = input.shape(); - DLDataType dtype = input.dtype(); - DLDevice device = input.device(); - - // Use environment allocator - ffi::Tensor output = ffi::Tensor::FromEnvAlloc( - TVMFFIEnvTensorAlloc, shape, dtype, device); -} -``` -""" From ac2ee6ff69ec76c2ba293d137904d1667d95d570 Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Mon, 17 Nov 2025 10:53:56 -0500 Subject: [PATCH 04/10] agent vibecoding end-to-end example --- .../definitions/gemm_n4096_k4096.json | 48 ++ .../solutions/agent_vibecode_gemm.json | 0 .../workloads/gemm_n4096_k4096.jsonl | 43 ++ examples/ffi/agent.md | 419 ++++++++++++++++++ examples/ffi/e2e_kernel.py | 69 +++ flashinfer_bench/agents/ffi_prompt.py | 93 ++-- flashinfer_bench/agents/torch_load_inline.py | 87 ---- 7 files changed, 625 insertions(+), 134 deletions(-) create mode 100644 examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json create mode 100644 examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json create mode 100644 examples/ffi/Example-FlashInfer-Trace/workloads/gemm_n4096_k4096.jsonl create mode 100644 examples/ffi/agent.md create mode 100644 examples/ffi/e2e_kernel.py delete mode 100644 flashinfer_bench/agents/torch_load_inline.py diff --git a/examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json b/examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json new file mode 100644 index 0000000..b3b4f61 --- /dev/null +++ b/examples/ffi/Example-FlashInfer-Trace/definitions/gemm_n4096_k4096.json @@ -0,0 +1,48 @@ +{ + "name": "gemm_n4096_k4096", + "description": "General matrix multiply (GEMM) C = A @ B.T. 
Captured from Llama 3.1 8B attn.o_proj.", + "op_type": "gemm", + "tags": [ + "status:verified", + "model:llama-3.1-8b" + ], + "axes": { + "M": { + "type": "var" + }, + "N": { + "type": "const", + "value": 4096 + }, + "K": { + "type": "const", + "value": 4096 + } + }, + "inputs": { + "A": { + "shape": [ + "M", + "K" + ], + "dtype": "float16" + }, + "B": { + "shape": [ + "N", + "K" + ], + "dtype": "float16" + } + }, + "outputs": { + "C": { + "shape": [ + "M", + "N" + ], + "dtype": "float16" + } + }, + "reference": "import torch\n\ndef run(A, B):\n C = torch.matmul(A, B.T)\n return C" +} diff --git a/examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json b/examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json new file mode 100644 index 0000000..e69de29 diff --git a/examples/ffi/Example-FlashInfer-Trace/workloads/gemm_n4096_k4096.jsonl b/examples/ffi/Example-FlashInfer-Trace/workloads/gemm_n4096_k4096.jsonl new file mode 100644 index 0000000..eb8cfa0 --- /dev/null +++ b/examples/ffi/Example-FlashInfer-Trace/workloads/gemm_n4096_k4096.jsonl @@ -0,0 +1,43 @@ +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "280860e6-08f0-427c-b7c5-9cffcfab1a10", "axes": {"M": 256}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "2e90109a-282e-484f-b94b-61f49e72fde2", "axes": {"M": 248}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "e7c939ae-2083-4b6f-a51a-8c76ffd08926", "axes": {"M": 240}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "3ab6479f-c1d0-4743-b2d4-1a46b01f9db7", "axes": {"M": 232}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "29581017-6470-4d78-9d02-554adecd9822", "axes": {"M": 224}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "e97cc8b3-9a2f-4d0d-aaa9-7522413c78da", "axes": {"M": 216}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "c69f1da6-7c62-46f5-867e-3cf5ed3aac04", "axes": {"M": 208}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "38a98eb2-3a86-41ce-994c-b6a2cec932a6", "axes": {"M": 200}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "62f7844c-b1f5-4e08-a057-70e55f092931", "axes": {"M": 192}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "1342c570-b505-4ccf-9fc2-b377d25b397e", "axes": {"M": 184}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "beb5da20-954c-44ef-8f1b-35a5dd848b06", "axes": {"M": 176}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": 
null, "workload": {"uuid": "c8c9178e-65f2-4124-a322-e66987aa1b34", "axes": {"M": 168}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "c103a7db-34d0-4bc0-abb0-2833c2458c50", "axes": {"M": 160}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "ed0bbc00-57e9-46e0-af02-249a64a46fa0", "axes": {"M": 152}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "83deae78-6557-46e9-b3ef-2ed254192d13", "axes": {"M": 144}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "76fde2b6-3f6e-484f-bcd4-fca79272a690", "axes": {"M": 136}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "29ebd771-0f1c-4894-8532-7265275a02b1", "axes": {"M": 128}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "73dd121d-72c7-4dda-9b6e-37d55c6ed867", "axes": {"M": 120}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "27fa4c34-6e08-459a-a8c4-f37b7cdb037b", "axes": {"M": 112}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "faac134d-36e3-4f99-809c-9e544ea5216f", "axes": {"M": 104}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "1948bb55-6253-4b9c-aa17-4fb13ffef7d0", "axes": {"M": 96}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "3a7e6db4-4127-45c1-9e54-c8c7cc25d632", "axes": {"M": 88}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "dcb04a7e-2faa-4858-a495-71c658b299ad", "axes": {"M": 80}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "ef996a93-b3b6-4702-aae2-f28fdbcfdd48", "axes": {"M": 72}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "67d4c8f3-2ff5-4838-8f07-d5d16f602eb3", "axes": {"M": 64}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "54d34708-309d-462b-829a-74c90243093c", "axes": {"M": 56}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "59ca23f5-a523-4cc4-9c1f-db510753d3f4", "axes": {"M": 48}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "54062a8b-a9ca-47d8-b5ef-7f6f0325ef39", 
"axes": {"M": 40}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "5230e6ed-48b8-4765-bc9f-a7cdaabed615", "axes": {"M": 32}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "73212638-6584-476b-848d-2cb8ce0b829c", "axes": {"M": 24}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "404d21b1-2237-4e3c-b3ff-9b68878e5d70", "axes": {"M": 16}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "4cdab8cd-cb6b-4e73-8fe0-75b55fd784b5", "axes": {"M": 8}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "f439da26-2483-406c-977b-be185901207f", "axes": {"M": 4}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "6c2f4ba8-94d3-4e8f-997f-b7454242695a", "axes": {"M": 2}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "e39649a6-6f42-4a1b-9731-b45a9a87f7a5", "axes": {"M": 1}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "094ef833-829f-4efa-925d-d5bae9d6a116", "axes": {"M": 7}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "4c6bdefd-dd94-48b6-be3e-7eb25658eefd", "axes": {"M": 35}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "da2a2234-f5e9-4332-b62d-39865128153c", "axes": {"M": 972}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "897b6544-56c2-4d96-98fd-453ae3418e4b", "axes": {"M": 70}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "8a8311fa-8bb6-487d-8a36-7378e9680df8", "axes": {"M": 2053}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "b626104c-94cc-436b-9d2c-1d31432c1a87", "axes": {"M": 8192}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "547e0ce5-e484-4e0c-8b38-f153fb7ce6d4", "axes": {"M": 2379}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} +{"definition": "gemm_n4096_k4096", "solution": null, "workload": {"uuid": "339de815-896d-4d4c-8060-07208d559276", "axes": {"M": 15}, "inputs": {"A": {"type": "random"}, "B": {"type": "random"}}}, "evaluation": null} diff --git a/examples/ffi/agent.md b/examples/ffi/agent.md new file mode 100644 index 0000000..136d58b --- /dev/null +++ b/examples/ffi/agent.md @@ -0,0 +1,419 @@ +# Agent Instructions: CUDA GEMM 
Implementation with TVM FFI
+
+## Task Overview
+
+Write a complete CUDA implementation solving the GEMM definition `gemm_n4096_k4096` and output it as a JSON file to:
+
+**Output Path**: `Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json`
+
+The implementation must use TVM FFI bindings and conform to the Solution JSON schema.
+
+## Target Operation
+
+**Operation**: General Matrix Multiply (GEMM)
+**Formula**: `C = A @ B.T`
+**Shapes**:
+- A: `[M, K]` where M is variable, K = 4096
+- B: `[N, K]` where N = 4096, K = 4096
+- C: `[M, N]` (output)
+- **Data type**: `float16` (FP16)
+
+**Note**: This is computing `A @ B.T` (transpose of B), not `A @ B`.
+
+## Solution Structure Requirements
+
+Your solution **must** include exactly 3 source files with these names:
+
+1. **`kernel.h`**: Header file with function declarations and shared definitions
+2. **`kernel.cu`**: CUDA kernel device code implementation
+3. **`main.cpp`**: TVM FFI host code with bindings
+
+## TVM FFI Requirements
+
+### Required Headers in main.cpp
+```cpp
+#include <tvm/ffi/container/tensor.h>  // TensorView: tensor arguments
+#include <tvm/ffi/function.h>          // TVM_FFI_DLL_EXPORT_TYPED_FUNC
+#include <tvm/ffi/error.h>             // TVM_FFI_ICHECK, TVM_FFI_THROW
+#include <tvm/ffi/extra/c_env_api.h>   // TVMFFIEnvGetStream
+#include <cuda_fp16.h>
+#include "kernel.h"
+```
+
+### Function Signature
+The exported function **must** be named `run` and match the definition's input/output names:
+
+```cpp
+void run(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C);
+```
+
+**Important**: The function takes A, B, and C as parameters. C is pre-allocated by the caller.
+
+### TVM FFI Binding
+Use TVM_FFI_DLL_EXPORT_TYPED_FUNC to expose the function:
+
+```cpp
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(run, run);
+```
+
+### Input Validation
+Validate inputs using TVM FFI error handling:
+
+```cpp
+// Check dimensions
+TVM_FFI_ICHECK_EQ(A.ndim(), 2) << "A must be 2D";
+TVM_FFI_ICHECK_EQ(B.ndim(), 2) << "B must be 2D";
+TVM_FFI_ICHECK_EQ(C.ndim(), 2) << "C must be 2D";
+
+// Check shapes
+TVM_FFI_ICHECK_EQ(A.size(1), 4096) << "A.shape[1] must be 4096 (K)";
+TVM_FFI_ICHECK_EQ(B.size(0), 4096) << "B.shape[0] must be 4096 (N)";
+TVM_FFI_ICHECK_EQ(B.size(1), 4096) << "B.shape[1] must be 4096 (K)";
+
+// Check shape compatibility
+TVM_FFI_ICHECK_EQ(A.size(1), B.size(1)) << "K dimension mismatch";
+TVM_FFI_ICHECK_EQ(C.size(0), A.size(0)) << "M dimension mismatch";
+TVM_FFI_ICHECK_EQ(C.size(1), B.size(0)) << "N dimension mismatch";
+
+// Check data types (float16)
+TVM_FFI_ICHECK_EQ(A.dtype().code, kDLFloat) << "A must be float type";
+TVM_FFI_ICHECK_EQ(A.dtype().bits, 16) << "A must be float16";
+TVM_FFI_ICHECK_EQ(B.dtype().code, kDLFloat) << "B must be float type";
+TVM_FFI_ICHECK_EQ(B.dtype().bits, 16) << "B must be float16";
+TVM_FFI_ICHECK_EQ(C.dtype().code, kDLFloat) << "C must be float type";
+TVM_FFI_ICHECK_EQ(C.dtype().bits, 16) << "C must be float16";
+
+// Check device (must be CUDA)
+TVM_FFI_ICHECK_EQ(A.device().device_type, kDLCUDA) << "A must be on CUDA";
+TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << "B must be on CUDA";
+TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << "C must be on CUDA";
+```
+
+### CUDA Stream Management
+Get the CUDA stream from TVM FFI environment:
+
+```cpp
+DLDevice dev = A.device();
+cudaStream_t stream = static_cast<cudaStream_t>(
+    TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+
+// Launch kernel on the stream
+kernel_launch<<<grid, block, 0, stream>>>(args...);
+```
+
+### Memory Access
+Access tensor data through TensorView API:
+
+```cpp
+const __half* A_data = static_cast<const __half*>(A.data_ptr());
+const __half* B_data =
static_cast(B.data_ptr()); +__half* C_data = static_cast<__half*>(C.data_ptr()); + +int64_t M = A.size(0); +int64_t K = A.size(1); +int64_t N = B.size(0); +``` + +## CUDA Kernel Implementation Guidelines + +### Recommended Approach +Implement a tiled GEMM kernel optimized for float16: + +1. **Use shared memory** for tile caching +2. **Leverage Tensor Cores** if targeting modern GPUs (use `__half` or `half2`) +3. **Thread block tiling**: Typical tile sizes like 128×128 or 256×128 +4. **Handle transposition**: Since we compute `A @ B.T`, adjust memory access patterns + +### Kernel Signature Example +```cpp +__global__ void gemm_kernel_device( + const half* __restrict__ A, + const half* __restrict__ B, + half* __restrict__ C, + int M, int N, int K +); +``` + +### Performance Considerations +- Use `__half` or `half2` types for FP16 operations +- Ensure coalesced memory access +- Minimize bank conflicts in shared memory +- Consider using warp-level primitives for reductions + +## File Organization + +### Required File Structure + +**File 1: `kernel.h`** +- CUDA kernel function declarations +- Host launcher function declarations +- Shared constants and type definitions +- Include guards + +**File 2: `kernel.cu`** +- `__global__` kernel implementations +- `__device__` helper functions +- Host-side kernel launcher function +- CUDA-specific optimizations (shared memory, tensor cores, etc.) + +**File 3: `main.cpp`** +- TVM FFI bindings +- `run` function that matches definition signature: `void run(TensorView A, TensorView B, TensorView C)` +- Input validation using `TVM_FFI_ICHECK_*` macros +- Stream management via `TVMFFIEnvGetStream()` +- `TVM_FFI_DLL_EXPORT_TYPED_FUNC` for function export + +## JSON Schema Format + +The output JSON must conform to the Solution schema and be written to: + +**`Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json`** + +### JSON Structure + +```json +{ + "name": "agent_vibecode_gemm", + "definition": "gemm_n4096_k4096", + "description": "High-performance CUDA GEMM implementation for C = A @ B.T using TVM FFI bindings", + "author": "vibecode-agent", + "spec": { + "language": "cuda", + "target_hardware": [ + "NVIDIA_H100", + "NVIDIA_A100" + ], + "dependencies": [], + "entry_point": "main.cpp::run" + }, + "sources": [ + { + "path": "kernel.h", + "content": "... complete header file content as string ..." + }, + { + "path": "kernel.cu", + "content": "... complete CUDA kernel code as string ..." + }, + { + "path": "main.cpp", + "content": "... complete TVM FFI binding code as string ..." + } + ] +} +``` + +### Critical Schema Fields + +| Field | Value | Notes | +|-------|-------|-------| +| `name` | `"agent_vibecode_gemm"` | Unique identifier for this solution | +| `definition` | `"gemm_n4096_k4096"` | **Must** match the definition name exactly | +| `language` | `"cuda"` | Lowercase, primary language | +| `target_hardware` | Array of strings | e.g., `["NVIDIA_H100", "NVIDIA_A100"]` | +| `entry_point` | `"main.cpp::run"` | Format: `{filename}::{function_name}` | +| `sources` | Array of 3 file objects | Each with `path` and `content` fields | + +### Entry Point Convention + +The entry point specifies which function the benchmarker will call: +- Format: `"main.cpp::run"` +- The function `run` must be exposed via `TVM_FFI_DLL_EXPORT_TYPED_FUNC` +- The benchmarker will: + 1. Compile all source files into a TVM FFI shared library + 2. Load the compiled module using TVM FFI + 3. Call the `run` function with test inputs `A`, `B`, and pre-allocated `C` + 4. 
Validate the output C against the reference
+
+## Complete Implementation Example
+
+Below is a skeleton showing the structure of all three files:
+
+### kernel.h
+```cpp
+#ifndef GEMM_N4096_K4096_KERNEL_H
+#define GEMM_N4096_K4096_KERNEL_H
+
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cstdint>
+
+// Constants from definition
+constexpr int GEMM_N_CONST = 4096;
+constexpr int GEMM_K_CONST = 4096;
+
+// Kernel launcher function
+void gemm_n4096_k4096_launch(
+    const __half* A,
+    const __half* B,
+    __half* C,
+    int M,
+    cudaStream_t stream
+);
+
+#endif  // GEMM_N4096_K4096_KERNEL_H
+```
+
+### kernel.cu
+```cpp
+#include "kernel.h"
+#include <mma.h>  // For tensor cores
+
+using namespace nvcuda;
+
+// Kernel configuration
+constexpr int BLOCK_M = 128;
+constexpr int BLOCK_N = 256;
+constexpr int BLOCK_K = 64;
+
+__global__ void gemm_kernel(
+    const __half* __restrict__ A,
+    const __half* __restrict__ B,
+    __half* __restrict__ C,
+    int M
+) {
+  // Implement optimized GEMM with:
+  // - Shared memory tiling
+  // - WMMA/Tensor Core operations
+  // - Coalesced memory access
+  // - Proper synchronization
+
+  // C = A @ B.T
+  // A is [M, 4096], B is [4096, 4096], C is [M, 4096]
+}
+
+void gemm_n4096_k4096_launch(
+    const __half* A,
+    const __half* B,
+    __half* C,
+    int M,
+    cudaStream_t stream
+) {
+  if (M <= 0) return;
+
+  dim3 block(256);
+  dim3 grid((GEMM_N_CONST + BLOCK_N - 1) / BLOCK_N,
+            (M + BLOCK_M - 1) / BLOCK_M);
+
+  gemm_kernel<<<grid, block, 0, stream>>>(A, B, C, M);
+}
+```
+
+### main.cpp
+```cpp
+#include <tvm/ffi/container/tensor.h>
+#include <tvm/ffi/function.h>
+#include <tvm/ffi/error.h>
+#include <tvm/ffi/extra/c_env_api.h>
+#include <cuda_fp16.h>
+#include "kernel.h"
+
+void run(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) {
+  // Input validation - dimensions
+  TVM_FFI_ICHECK_EQ(A.ndim(), 2) << "A must be 2D";
+  TVM_FFI_ICHECK_EQ(B.ndim(), 2) << "B must be 2D";
+  TVM_FFI_ICHECK_EQ(C.ndim(), 2) << "C must be 2D";
+
+  // Get dimensions
+  int64_t M = A.size(0);
+  int64_t K = A.size(1);
+  int64_t N = B.size(0);
+
+  // Check shapes
+  TVM_FFI_ICHECK_EQ(K, 4096) << "A.shape[1] must be 4096 (K)";
+  TVM_FFI_ICHECK_EQ(N, 4096) << "B.shape[0] must be 4096 (N)";
+  TVM_FFI_ICHECK_EQ(B.size(1), 4096) << "B.shape[1] must be 4096 (K)";
+  TVM_FFI_ICHECK_EQ(C.size(0), M) << "C.shape[0] must match A.shape[0] (M)";
+  TVM_FFI_ICHECK_EQ(C.size(1), N) << "C.shape[1] must be 4096 (N)";
+
+  // Check data types (float16)
+  TVM_FFI_ICHECK_EQ(A.dtype().code, kDLFloat) << "A must be float type";
+  TVM_FFI_ICHECK_EQ(A.dtype().bits, 16) << "A must be float16";
+  TVM_FFI_ICHECK_EQ(B.dtype().code, kDLFloat) << "B must be float type";
+  TVM_FFI_ICHECK_EQ(B.dtype().bits, 16) << "B must be float16";
+  TVM_FFI_ICHECK_EQ(C.dtype().code, kDLFloat) << "C must be float type";
+  TVM_FFI_ICHECK_EQ(C.dtype().bits, 16) << "C must be float16";
+
+  // Check device (must be CUDA)
+  TVM_FFI_ICHECK_EQ(A.device().device_type, kDLCUDA) << "A must be on CUDA";
+  TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << "B must be on CUDA";
+  TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << "C must be on CUDA";
+
+  // Get data pointers
+  const __half* A_data = static_cast<const __half*>(A.data_ptr());
+  const __half* B_data = static_cast<const __half*>(B.data_ptr());
+  __half* C_data = static_cast<__half*>(C.data_ptr());
+
+  // Get CUDA stream from TVM FFI environment
+  DLDevice dev = A.device();
+  cudaStream_t stream = static_cast<cudaStream_t>(
+      TVMFFIEnvGetStream(dev.device_type, dev.device_id));
+
+  // Launch kernel
+  gemm_n4096_k4096_launch(A_data, B_data, C_data, static_cast<int>(M), stream);
+}
+
+// Export the function with TVM FFI
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(run, run);
+```
+
+## Performance
Optimization Guidelines + +Your CUDA kernel should include: + +1. **Tensor Core Usage (WMMA)**: Use `nvcuda::wmma` for 16x16x16 matrix operations +2. **Shared Memory Tiling**: Cache tiles of A and B in shared memory +3. **Memory Coalescing**: Ensure threads access consecutive memory addresses +4. **Bank Conflict Avoidance**: Add padding to shared memory arrays +5. **Compute Intensity**: Maximize compute-to-memory-access ratio +6. **Register Optimization**: Minimize register usage for higher occupancy +7. **Stream Pipelining**: Overlap compute and memory operations + +## Output Format + +Write the complete JSON solution to: +**`Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json`** + +The JSON must be valid and contain: +- All required schema fields +- Complete source code for all 3 files in the `content` fields +- Properly escaped strings (use JSON encoding) + +## Validation Checklist + +Before finalizing, verify: +- [ ] File names are exactly: `kernel.h`, `kernel.cu`, `main.cpp` +- [ ] Entry point is `"main.cpp::run"` +- [ ] Function signature: `void run(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C)` +- [ ] TVM_FFI_DLL_EXPORT_TYPED_FUNC exposes the `run` function +- [ ] All three files included in `sources` array +- [ ] Input validation with `TVM_FFI_ICHECK_*` macros +- [ ] Kernel implements `C = A @ B.T` (transpose of B) +- [ ] Data type is `__half` (float16) +- [ ] CUDA stream from `TVMFFIEnvGetStream()` +- [ ] Checks that all tensors are on CUDA device +- [ ] JSON is valid and properly formatted +- [ ] All TVM FFI headers included correctly + +## Expected Agent Behavior + +1. **Read** the GEMM definition from `definitions/gemm_n4096_k4096.json` +2. **Understand** the operation: `C = A @ B.T` with shapes [M,K] × [N,K] → [M,N] +3. **Implement** a high-performance CUDA kernel with tiling and tensor cores +4. **Create** TVM FFI bindings following the API guidelines +5. **Package** all source code into the Solution JSON format +6. **Write** the JSON to `Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json` + +The JSON file should be ready to be consumed by the flashinfer-bench benchmarking system. + +## Summary + +This agent.md provides complete instructions for generating a CUDA GEMM kernel implementation using TVM FFI bindings. The key points are: + +- **3 files required**: `kernel.h`, `kernel.cu`, `main.cpp` +- **Entry point**: `main.cpp::run` with signature `void run(TensorView A, TensorView B, TensorView C)` +- **TVM FFI export**: Use `TVM_FFI_DLL_EXPORT_TYPED_FUNC(run, run)` +- **Validation**: Use `TVM_FFI_ICHECK_*` macros for input validation +- **Stream management**: Get stream via `TVMFFIEnvGetStream()` +- **Output**: Write complete JSON to `Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json` + diff --git a/examples/ffi/e2e_kernel.py b/examples/ffi/e2e_kernel.py new file mode 100644 index 0000000..fe3e719 --- /dev/null +++ b/examples/ffi/e2e_kernel.py @@ -0,0 +1,69 @@ +""" +End-to-end example script, runs benchmark on agent generated gemm kernel, and applies it to a test input. 
+""" +import torch +from flashinfer_bench import ( + Benchmark, + BenchmarkConfig, + TraceSet, + apply, + enable_apply, + disable_apply, +) + +def main(): + traceset_path = "Example-FlashInfer-Trace" + + print(f"Loading Example-FlashInfer-Trace") + traceset = TraceSet.from_path(traceset_path) + + # Run benchmark + print("\nRunning Benchmark on agent generated gemm kernel") + benchmark = Benchmark(traceset, BenchmarkConfig()) + result_traceset = benchmark.run_all() + + print("\nBenchmark Complete") + for def_name, traces in result_traceset.traces.items(): + print(f"\n{def_name}:") + for trace in traces: + print(f" Solution: {trace.solution.name}") + print(f" Status: {trace.evaluation.status.value}") + if trace.evaluation.performance: + print(f" Speedup: {trace.evaluation.performance.speedup_factor:.2f}x") + + # Apply kernel + print("\nApplying generated kernel on test input") + + with enable_apply(traceset_path): + # Random test inputs for gemm_n4096_k4096 + # A: [M, K] = [1024, 4096] + # B: [N, K] = [4096, 4096] + # C = A @ B.T -> [1024, 4096] + + M, N, K = 1024, 4096, 4096 + A = torch.randn(M, K, dtype=torch.float16, device="cuda") + B = torch.randn(N, K, dtype=torch.float16, device="cuda") + + def reference_gemm(A, B): + return torch.matmul(A, B.T) + ref_sol = reference_gemm(A, B) + + test_sol = apply( + "gemm_n4096_k4096", + runtime_kwargs={"A": A, "B": B}, + fallback=reference_gemm + ) + + if test_sol is not None: + diff = torch.abs(ref_sol - test_sol).max().item() + print(f"Max difference vs reference: {diff}") + if diff < 1e-2: + print("Vibe coded kernel is correct") + else: + print("Vibe coded kernel differs from torch matmul") + else: + print("No result returned (using fallback)") + +if __name__ == "__main__": + main() + diff --git a/flashinfer_bench/agents/ffi_prompt.py b/flashinfer_bench/agents/ffi_prompt.py index ff09517..ab80df3 100644 --- a/flashinfer_bench/agents/ffi_prompt.py +++ b/flashinfer_bench/agents/ffi_prompt.py @@ -240,11 +240,11 @@ int64_t channels = input.size(1); int64_t height = input.size(2); int64_t width = input.size(3); - + // Get data pointer float* in_data = static_cast(input.data_ptr()); float* out_data = static_cast(output.data_ptr()); - + // Check device DLDevice dev = input.device(); if (dev.device_type == kDLCUDA) { @@ -262,8 +262,8 @@ **Static Factory Methods:** ```cpp // From DLPack -static Tensor FromDLPack(DLManagedTensor* tensor, - size_t require_alignment = 0, +static Tensor FromDLPack(DLManagedTensor* tensor, + size_t require_alignment = 0, bool require_contiguous = false) static Tensor FromDLPackVersioned(DLManagedTensorVersioned* tensor, @@ -272,8 +272,8 @@ // Allocate from environment (recommended for kernel outputs) static Tensor FromEnvAlloc(int (*env_alloc)(DLTensor*, TVMFFIObjectHandle*), - ffi::ShapeView shape, - DLDataType dtype, + ffi::ShapeView shape, + DLDataType dtype, DLDevice device) // Example: Allocate tensor using environment allocator @@ -437,7 +437,7 @@ **Purpose:** Hash map container with copy-on-write semantics. Similar to `std::unordered_map`. 
-**Type Parameters:** +**Type Parameters:** - `K` (key type) must be hashable and compatible with Any - `V` (value type) must be compatible with Any @@ -776,7 +776,7 @@ DLDataType StringToDLDataType(const String& str) // Examples: "float32", "int64", "float16", "uint8" -// DLDataType to String +// DLDataType to String String DLDataTypeToString(DLDataType dtype) // Get type code as string (internal) @@ -819,7 +819,7 @@ class Error : public std::exception { public: Error(std::string kind, std::string message, std::string backtrace); - + std::string kind() const; // Error category std::string message() const; // Error message std::string backtrace() const; // Stack trace @@ -901,12 +901,12 @@ class Error : public std::exception { if (x.ndim() != 2) { TVM_FFI_THROW(ValueError) << "Expected 2D tensor, got " << x.ndim() << "D"; } - + DLDataType dt = x.dtype(); if (dt.code != kDLFloat || dt.bits != 32) { TVM_FFI_THROW(TypeError) << "Expected float32, got " << DLDataTypeToString(dt); } - + // Internal consistency checks TVM_FFI_ICHECK_EQ(x.size(1), y.size(0)) << "Dimension mismatch for matmul"; } @@ -923,7 +923,7 @@ class Error : public std::exception { TVM_FFI_DLL_EXPORT_TYPED_FUNC(export_name, cpp_function) // Example: -void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { +void MyKernel(tvm::ffi::TensorView a, tvm::ffi::TensorView b) { // kernel implementation } @@ -972,24 +972,24 @@ class Function { // Get global function by name static std::optional GetGlobal(std::string_view name); static Function GetGlobalRequired(std::string_view name); // Throws if not found - + // Set global function static void SetGlobal(std::string_view name, Function func, bool override = false); - + // List all global function names static std::vector ListGlobalNames(); - + // Remove global function static void RemoveGlobal(const String& name); - + // Call function with arguments template Any operator()(Args&&... 
args) const; - + // Create from typed function template static Function FromTyped(TCallable callable); - + template static Function FromTyped(TCallable callable, std::string name); }; @@ -1035,20 +1035,20 @@ class Module { public: // Load module from file static Module LoadFromFile(const String& file_name); - + // Get function from module Optional GetFunction(const String& name, bool query_imports = true); - + // Check if function exists bool ImplementsFunction(const String& name, bool query_imports = true); - + // Get function metadata Optional GetFunctionDoc(const String& name, bool query_imports = true); Optional GetFunctionMetadata(const String& name, bool query_imports = true); - + // Import another module void ImportModule(const Module& other); - + // Export module void WriteToFile(const String& file_name, const String& format) const; Bytes SaveToBytes() const; @@ -1104,7 +1104,7 @@ class Module { TVMFFIStreamHandle TVMFFIEnvGetStream(int32_t device_type, int32_t device_id) // Set current CUDA stream for device -int TVMFFIEnvSetStream(int32_t device_type, +int TVMFFIEnvSetStream(int32_t device_type, int32_t device_id, TVMFFIStreamHandle stream, TVMFFIStreamHandle* opt_out_original_stream) @@ -1134,7 +1134,7 @@ class Module { // Usage: Allocate output tensor ffi::Tensor output = ffi::Tensor::FromEnvAlloc( - TVMFFIEnvTensorAlloc, + TVMFFIEnvTensorAlloc, shape, // ffi::ShapeView dtype, // DLDataType device // DLDevice @@ -1200,23 +1200,23 @@ class Module { TVM_FFI_ICHECK_EQ(c.ndim(), 1) << "c must be 1D"; TVM_FFI_ICHECK_EQ(a.size(0), b.size(0)) << "Shape mismatch"; TVM_FFI_ICHECK_EQ(a.size(0), c.size(0)) << "Shape mismatch"; - + // Check dtype if (a.dtype().code != kDLFloat || a.dtype().bits != 32) { TVM_FFI_THROW(TypeError) << "Expected float32 tensor"; } - + // Get data float* a_data = static_cast(a.data_ptr()); float* b_data = static_cast(b.data_ptr()); float* c_data = static_cast(c.data_ptr()); int64_t n = a.size(0); - + // Get CUDA stream DLDevice dev = a.device(); cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - + // Launch kernel int64_t threads = 256; int64_t blocks = (n + threads - 1) / threads; @@ -1233,20 +1233,20 @@ class Module { // Validate TVM_FFI_ICHECK_EQ(input.ndim(), 2) << "Expected 2D tensor"; TVM_FFI_ICHECK_EQ(output.ndim(), 2) << "Expected 2D tensor"; - + // Get dimensions int64_t height = input.size(0); int64_t width = input.size(1); TVM_FFI_ICHECK_EQ(output.size(0), height); TVM_FFI_ICHECK_EQ(output.size(1), width); - + // Check contiguous TVM_FFI_ICHECK(input.IsContiguous()) << "Input must be contiguous"; - + // Access with strides (more general) int64_t stride_h = input.stride(0); int64_t stride_w = input.stride(1); - + // For contiguous tensors: element[i][j] at data[i * stride_h + j * stride_w] } ``` @@ -1254,12 +1254,12 @@ class Module { ## Pattern 3: Allocating Output Tensors ```cpp -tvm::ffi::Tensor AllocateOutput(tvm::ffi::TensorView input, +tvm::ffi::Tensor AllocateOutput(tvm::ffi::TensorView input, tvm::ffi::ShapeView output_shape) { // Create output with same device and dtype as input DLDevice device = input.device(); DLDataType dtype = input.dtype(); - + // Allocate using environment allocator tvm::ffi::Tensor output = tvm::ffi::Tensor::FromEnvAlloc( TVMFFIEnvTensorAlloc, @@ -1267,7 +1267,7 @@ class Module { dtype, device ); - + return output; } @@ -1275,10 +1275,10 @@ class Module { void MyKernel(tvm::ffi::TensorView input) { // Create output shape Shape out_shape = {input.size(0), input.size(1) * 2}; - + // 
Allocate Tensor output = AllocateOutput(input, out_shape); - + // Now use output.data_ptr() in kernel } ``` @@ -1288,7 +1288,7 @@ class Module { ```cpp void UniversalKernel(tvm::ffi::TensorView input, tvm::ffi::TensorView output) { DLDevice dev = input.device(); - + switch (dev.device_type) { case kDLCUDA: { // CUDA implementation @@ -1323,13 +1323,13 @@ class Module { // Extract config with defaults int64_t block_size = config.Get("block_size", 256); int64_t num_threads = config.Get("num_threads", 1024); - + // Or check if key exists if (config.count("custom_param") > 0) { int64_t custom = config.at("custom_param"); // use custom } - + // Launch with config // ... } @@ -1341,15 +1341,15 @@ class Module { ```cpp // Return tuple -tvm::ffi::Tuple +tvm::ffi::Tuple ComputeWithStats(tvm::ffi::TensorView input) { // Allocate output tvm::ffi::Tensor output = AllocateOutput(input, input.shape()); - + // Compute statistics int64_t count = input.numel(); float mean = 0.0f; // ... compute mean - + // Return multiple values return {output, count, mean}; } @@ -1363,7 +1363,7 @@ class Module { void BatchProcess(tvm::ffi::Array inputs, tvm::ffi::Array outputs) { TVM_FFI_ICHECK_EQ(inputs.size(), outputs.size()) << "Size mismatch"; - + for (size_t i = 0; i < inputs.size(); ++i) { TensorView in = inputs[i]; TensorView out = outputs[i]; @@ -1423,7 +1423,7 @@ class Module { DLDevice dev = tensor.device(); cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - + float* data = static_cast(tensor.data_ptr()); int64_t n = tensor.numel(); @@ -1866,4 +1866,3 @@ class Module { } ``` """ - diff --git a/flashinfer_bench/agents/torch_load_inline.py b/flashinfer_bench/agents/torch_load_inline.py deleted file mode 100644 index 5cb8c7c..0000000 --- a/flashinfer_bench/agents/torch_load_inline.py +++ /dev/null @@ -1,87 +0,0 @@ -import torch -from torch.utils.cpp_extension import load_inline - -cuda_source = """ -#include -#include -#include - -namespace torch_example_cuda { - -__global__ void AddOneKernel(float* x, float* y, int n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - y[idx] = x[idx] + 1; - } -} - -torch::Tensor add_one_cuda(torch::Tensor x) { - TORCH_CHECK(x.is_cuda(), "x must be a CUDA tensor"); - TORCH_CHECK(x.is_contiguous(), "x must be contiguous"); - TORCH_CHECK(x.dtype() == torch::kFloat32, "x must be float32"); - - auto y = torch::empty_like(x); - - int64_t n = x.numel(); - float* x_data = x.data_ptr(); - float* y_data = y.data_ptr(); - - int64_t threads = 256; - int64_t blocks = (n + threads - 1) / threads; - - cudaStream_t stream = c10::cuda::getCurrentCUDAStream(); - - AddOneKernel<<>>(x_data, y_data, n); - - return y; -} - -} // namespace torch_example_cuda -""" - -cpp_source = """ -#include - -namespace torch_example_cuda { -torch::Tensor add_one_cuda(torch::Tensor x); -} - -torch::Tensor add_one(torch::Tensor x) { - return torch_example_cuda::add_one_cuda(x); -} - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("add_one", &add_one, "Add one to each element (CUDA)"); -} -""" - - -def main(): - print("Compiling CUDA extension with torch.utils.cpp_extension.load_inline...") - - module = load_inline( - name="add_one_cuda", - cpp_sources=cpp_source, - cuda_sources=cuda_source, - verbose=True, - with_cuda=True, - ) - - print("Compilation successful!") - - # Test the compiled module - print("\nTesting the module...") - x = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], device="cuda") - print(f"Input: {x}") - - y = module.add_one(x) - print(f"Output: 
{y}") - - expected = x + 1 - print(f"Expected: {expected}") - print(f"Match: {torch.allclose(y, expected)}") - - -if __name__ == "__main__": - main() - From 2f5af2faed49d0d6c799c43ff1a4955fec9cb18d Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Mon, 17 Nov 2025 11:15:08 -0500 Subject: [PATCH 05/10] reformat tests --- examples/ffi/agent.md | 27 +++++++------- examples/ffi/e2e_kernel.py | 36 ++++++++++--------- .../agents => tests/agent}/load_inline.py | 9 +++-- .../agent}/test_existing_prompt_sol.py | 11 ------ .../agents => tests/agent}/test_prompt.py | 4 +-- .../test_results/claude-opus-4-1-20250805.txt | 0 .../agent}/test_results/gpt-5-2025-08-07.txt | 0 .../test_results/gpt-5-mini-2025-08-07.txt | 0 .../agent}/test_results/o3.txt | 0 .../test_results/o4-mini-2025-04-16.txt | 0 10 files changed, 40 insertions(+), 47 deletions(-) rename {flashinfer_bench/agents => tests/agent}/load_inline.py (91%) rename {flashinfer_bench/agents => tests/agent}/test_existing_prompt_sol.py (95%) rename {flashinfer_bench/agents => tests/agent}/test_prompt.py (98%) rename {flashinfer_bench/agents => tests/agent}/test_results/claude-opus-4-1-20250805.txt (100%) rename {flashinfer_bench/agents => tests/agent}/test_results/gpt-5-2025-08-07.txt (100%) rename {flashinfer_bench/agents => tests/agent}/test_results/gpt-5-mini-2025-08-07.txt (100%) rename {flashinfer_bench/agents => tests/agent}/test_results/o3.txt (100%) rename {flashinfer_bench/agents => tests/agent}/test_results/o4-mini-2025-04-16.txt (100%) diff --git a/examples/ffi/agent.md b/examples/ffi/agent.md index 136d58b..0523307 100644 --- a/examples/ffi/agent.md +++ b/examples/ffi/agent.md @@ -10,11 +10,11 @@ The implementation must use TVM FFI bindings and conform to the Solution JSON sc ## Target Operation -**Operation**: General Matrix Multiply (GEMM) -**Formula**: `C = A @ B.T` +**Operation**: General Matrix Multiply (GEMM) +**Formula**: `C = A @ B.T` **Shapes**: - A: `[M, K]` where M is variable, K = 4096 -- B: `[N, K]` where N = 4096, K = 4096 +- B: `[N, K]` where N = 4096, K = 4096 - C: `[M, N]` (output) - **Data type**: `float16` (FP16) @@ -277,7 +277,7 @@ __global__ void gemm_kernel( // - WMMA/Tensor Core operations // - Coalesced memory access // - Proper synchronization - + // C = A @ B.T // A is [M, 4096], B is [4096, 4096], C is [M, 4096] } @@ -290,11 +290,11 @@ void gemm_n4096_k4096_launch( cudaStream_t stream ) { if (M <= 0) return; - + dim3 block(256); dim3 grid((GEMM_N_CONST + BLOCK_N - 1) / BLOCK_N, (M + BLOCK_M - 1) / BLOCK_M); - + gemm_kernel<<>>(A, B, C, M); } ``` @@ -313,19 +313,19 @@ void run(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) TVM_FFI_ICHECK_EQ(A.ndim(), 2) << "A must be 2D"; TVM_FFI_ICHECK_EQ(B.ndim(), 2) << "B must be 2D"; TVM_FFI_ICHECK_EQ(C.ndim(), 2) << "C must be 2D"; - + // Get dimensions int64_t M = A.size(0); int64_t K = A.size(1); int64_t N = B.size(0); - + // Check shapes TVM_FFI_ICHECK_EQ(K, 4096) << "A.shape[1] must be 4096 (K)"; TVM_FFI_ICHECK_EQ(N, 4096) << "B.shape[0] must be 4096 (N)"; TVM_FFI_ICHECK_EQ(B.size(1), 4096) << "B.shape[1] must be 4096 (K)"; TVM_FFI_ICHECK_EQ(C.size(0), M) << "C.shape[0] must match A.shape[0] (M)"; TVM_FFI_ICHECK_EQ(C.size(1), N) << "C.shape[1] must be 4096 (N)"; - + // Check data types (float16) TVM_FFI_ICHECK_EQ(A.dtype().code, kDLFloat) << "A must be float type"; TVM_FFI_ICHECK_EQ(A.dtype().bits, 16) << "A must be float16"; @@ -333,22 +333,22 @@ void run(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) 
TVM_FFI_ICHECK_EQ(B.dtype().bits, 16) << "B must be float16"; TVM_FFI_ICHECK_EQ(C.dtype().code, kDLFloat) << "C must be float type"; TVM_FFI_ICHECK_EQ(C.dtype().bits, 16) << "C must be float16"; - + // Check device (must be CUDA) TVM_FFI_ICHECK_EQ(A.device().device_type, kDLCUDA) << "A must be on CUDA"; TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << "B must be on CUDA"; TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << "C must be on CUDA"; - + // Get data pointers const __half* A_data = static_cast(A.data_ptr()); const __half* B_data = static_cast(B.data_ptr()); __half* C_data = static_cast<__half*>(C.data_ptr()); - + // Get CUDA stream from TVM FFI environment DLDevice dev = A.device(); cudaStream_t stream = static_cast( TVMFFIEnvGetStream(dev.device_type, dev.device_id)); - + // Launch kernel gemm_n4096_k4096_launch(A_data, B_data, C_data, static_cast(M), stream); } @@ -416,4 +416,3 @@ This agent.md provides complete instructions for generating a CUDA GEMM kernel i - **Validation**: Use `TVM_FFI_ICHECK_*` macros for input validation - **Stream management**: Get stream via `TVMFFIEnvGetStream()` - **Output**: Write complete JSON to `Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json` - diff --git a/examples/ffi/e2e_kernel.py b/examples/ffi/e2e_kernel.py index fe3e719..5cab613 100644 --- a/examples/ffi/e2e_kernel.py +++ b/examples/ffi/e2e_kernel.py @@ -1,27 +1,30 @@ """ End-to-end example script, runs benchmark on agent generated gemm kernel, and applies it to a test input. """ + import torch + from flashinfer_bench import ( Benchmark, BenchmarkConfig, TraceSet, apply, - enable_apply, disable_apply, + enable_apply, ) + def main(): traceset_path = "Example-FlashInfer-Trace" - + print(f"Loading Example-FlashInfer-Trace") traceset = TraceSet.from_path(traceset_path) - + # Run benchmark print("\nRunning Benchmark on agent generated gemm kernel") benchmark = Benchmark(traceset, BenchmarkConfig()) result_traceset = benchmark.run_all() - + print("\nBenchmark Complete") for def_name, traces in result_traceset.traces.items(): print(f"\n{def_name}:") @@ -30,30 +33,29 @@ def main(): print(f" Status: {trace.evaluation.status.value}") if trace.evaluation.performance: print(f" Speedup: {trace.evaluation.performance.speedup_factor:.2f}x") - + # Apply kernel print("\nApplying generated kernel on test input") - + with enable_apply(traceset_path): # Random test inputs for gemm_n4096_k4096 # A: [M, K] = [1024, 4096] # B: [N, K] = [4096, 4096] # C = A @ B.T -> [1024, 4096] - - M, N, K = 1024, 4096, 4096 + + M, N, K = 1024, 4096, 4096 A = torch.randn(M, K, dtype=torch.float16, device="cuda") B = torch.randn(N, K, dtype=torch.float16, device="cuda") - + def reference_gemm(A, B): - return torch.matmul(A, B.T) + return torch.matmul(A, B.T) + ref_sol = reference_gemm(A, B) - + test_sol = apply( - "gemm_n4096_k4096", - runtime_kwargs={"A": A, "B": B}, - fallback=reference_gemm + "gemm_n4096_k4096", runtime_kwargs={"A": A, "B": B}, fallback=reference_gemm ) - + if test_sol is not None: diff = torch.abs(ref_sol - test_sol).max().item() print(f"Max difference vs reference: {diff}") @@ -62,8 +64,8 @@ def reference_gemm(A, B): else: print("Vibe coded kernel differs from torch matmul") else: - print("No result returned (using fallback)") + print("No result returned (using fallback)") + if __name__ == "__main__": main() - diff --git a/flashinfer_bench/agents/load_inline.py b/tests/agent/load_inline.py similarity index 91% rename from flashinfer_bench/agents/load_inline.py rename to 
tests/agent/load_inline.py index fc1811c..677b6a4 100644 --- a/flashinfer_bench/agents/load_inline.py +++ b/tests/agent/load_inline.py @@ -1,6 +1,3 @@ -import re -from pathlib import Path - import torch import tvm_ffi.cpp from tvm_ffi import Module @@ -40,6 +37,12 @@ def main(): mod: Module = tvm_ffi.cpp.load_inline(name="add_one_cuda", cuda_sources=cuda_source) print("Compilation successful") + x = torch.randn(8).cuda() + y = torch.randn(8).cuda() + print(x) + mod.add_one_cuda(x, y) + print(y) + if __name__ == "__main__": main() diff --git a/flashinfer_bench/agents/test_existing_prompt_sol.py b/tests/agent/test_existing_prompt_sol.py similarity index 95% rename from flashinfer_bench/agents/test_existing_prompt_sol.py rename to tests/agent/test_existing_prompt_sol.py index d597f2f..12a7283 100644 --- a/flashinfer_bench/agents/test_existing_prompt_sol.py +++ b/tests/agent/test_existing_prompt_sol.py @@ -1,8 +1,3 @@ -""" -Test script to re-test previously generated CUDA kernels from test_results directory. -Reads .txt files containing generated code and runs the same test cases. -""" - import re from pathlib import Path @@ -12,15 +7,9 @@ def extract_code_from_file(file_path: Path) -> tuple[str, str]: - """Extract model name and generated code from a test result file. - - Returns: - tuple: (model_name, code) - """ with open(file_path, "r") as f: content = f.read() - # Extract model name model_match = re.search(r"Model: (.+?)\n", content) model_name = model_match.group(1) if model_match else file_path.stem diff --git a/flashinfer_bench/agents/test_prompt.py b/tests/agent/test_prompt.py similarity index 98% rename from flashinfer_bench/agents/test_prompt.py rename to tests/agent/test_prompt.py index bea64ca..66cf356 100644 --- a/flashinfer_bench/agents/test_prompt.py +++ b/tests/agent/test_prompt.py @@ -180,7 +180,7 @@ def test_model(model_name: str, output_dir: Path): with open(output_file, "a") as f: f.write("Test Results:\n") f.write("=" * 80 + "\n") - f.write(f"Compilation: SUCCESS\n") + f.write("Compilation: SUCCESS\n") f.write(f"Test 1 (small tensor): {'PASS' if test1_passed else 'FAIL'}\n") f.write(f"Test 2 (large tensor): {'PASS' if test2_passed else 'FAIL'}\n") @@ -201,7 +201,7 @@ def test_model(model_name: str, output_dir: Path): with open(output_file, "a") as f: f.write("Test Results:\n") f.write("=" * 80 + "\n") - f.write(f"Compilation: FAILED\n") + f.write("Compilation: FAILED\n") f.write(f"Error: {str(e)}\n") return { diff --git a/flashinfer_bench/agents/test_results/claude-opus-4-1-20250805.txt b/tests/agent/test_results/claude-opus-4-1-20250805.txt similarity index 100% rename from flashinfer_bench/agents/test_results/claude-opus-4-1-20250805.txt rename to tests/agent/test_results/claude-opus-4-1-20250805.txt diff --git a/flashinfer_bench/agents/test_results/gpt-5-2025-08-07.txt b/tests/agent/test_results/gpt-5-2025-08-07.txt similarity index 100% rename from flashinfer_bench/agents/test_results/gpt-5-2025-08-07.txt rename to tests/agent/test_results/gpt-5-2025-08-07.txt diff --git a/flashinfer_bench/agents/test_results/gpt-5-mini-2025-08-07.txt b/tests/agent/test_results/gpt-5-mini-2025-08-07.txt similarity index 100% rename from flashinfer_bench/agents/test_results/gpt-5-mini-2025-08-07.txt rename to tests/agent/test_results/gpt-5-mini-2025-08-07.txt diff --git a/flashinfer_bench/agents/test_results/o3.txt b/tests/agent/test_results/o3.txt similarity index 100% rename from flashinfer_bench/agents/test_results/o3.txt rename to tests/agent/test_results/o3.txt diff --git 
a/flashinfer_bench/agents/test_results/o4-mini-2025-04-16.txt b/tests/agent/test_results/o4-mini-2025-04-16.txt similarity index 100% rename from flashinfer_bench/agents/test_results/o4-mini-2025-04-16.txt rename to tests/agent/test_results/o4-mini-2025-04-16.txt From 9a0ce218ca4fe05abdb961d6ba3832d6a0465db5 Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Tue, 25 Nov 2025 20:35:02 -0500 Subject: [PATCH 06/10] remove testing results --- .../test_results/claude-opus-4-1-20250805.txt | 124 --------------- tests/agent/test_results/gpt-5-2025-08-07.txt | 111 -------------- .../test_results/gpt-5-mini-2025-08-07.txt | 129 ---------------- tests/agent/test_results/o3.txt | 141 ------------------ .../agent/test_results/o4-mini-2025-04-16.txt | 122 --------------- 5 files changed, 627 deletions(-) delete mode 100644 tests/agent/test_results/claude-opus-4-1-20250805.txt delete mode 100644 tests/agent/test_results/gpt-5-2025-08-07.txt delete mode 100644 tests/agent/test_results/gpt-5-mini-2025-08-07.txt delete mode 100644 tests/agent/test_results/o3.txt delete mode 100644 tests/agent/test_results/o4-mini-2025-04-16.txt diff --git a/tests/agent/test_results/claude-opus-4-1-20250805.txt b/tests/agent/test_results/claude-opus-4-1-20250805.txt deleted file mode 100644 index dcae1da..0000000 --- a/tests/agent/test_results/claude-opus-4-1-20250805.txt +++ /dev/null @@ -1,124 +0,0 @@ -Model: claude-opus-4-1-20250805 -================================================================================ - -Generated Code: -================================================================================ -// File: elementwise_add.cu -#include -#include -#include -#include - -namespace elementwise_ops { - -__global__ void ElementwiseAddKernel(const float* a, const float* b, float* c, int64_t n) { - int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - c[idx] = a[idx] + b[idx]; - } -} - -void ElementwiseAdd(tvm::ffi::TensorView a, tvm::ffi::TensorView b, tvm::ffi::TensorView c) { - // Validate that all tensors are on the same device - DLDevice dev_a = a.device(); - DLDevice dev_b = b.device(); - DLDevice dev_c = c.device(); - - if (dev_a.device_type != dev_b.device_type || dev_a.device_id != dev_b.device_id || - dev_a.device_type != dev_c.device_type || dev_a.device_id != dev_c.device_id) { - TVM_FFI_THROW(ValueError) << "All tensors must be on the same device"; - } - - // Check that we're on CUDA device - if (dev_a.device_type != kDLCUDA) { - TVM_FFI_THROW(RuntimeError) << "This kernel only supports CUDA devices, got device type: " - << dev_a.device_type; - } - - // Validate tensor dimensions - must be 1D - if (a.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be 1D, got " << a.ndim() << "D"; - } - if (b.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "Input tensor 'b' must be 1D, got " << b.ndim() << "D"; - } - if (c.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be 1D, got " << c.ndim() << "D"; - } - - // Validate tensor shapes match - int64_t n = a.size(0); - if (b.size(0) != n) { - TVM_FFI_THROW(ValueError) << "Input tensors must have the same size. " - << "a.size=" << n << ", b.size=" << b.size(0); - } - if (c.size(0) != n) { - TVM_FFI_THROW(ValueError) << "Output tensor size must match input size. 
" - << "Expected " << n << ", got " << c.size(0); - } - - // Validate data types - must be float32 - DLDataType dtype_a = a.dtype(); - DLDataType dtype_b = b.dtype(); - DLDataType dtype_c = c.dtype(); - - if (dtype_a.code != kDLFloat || dtype_a.bits != 32 || dtype_a.lanes != 1) { - TVM_FFI_THROW(TypeError) << "Input tensor 'a' must be float32"; - } - if (dtype_b.code != kDLFloat || dtype_b.bits != 32 || dtype_b.lanes != 1) { - TVM_FFI_THROW(TypeError) << "Input tensor 'b' must be float32"; - } - if (dtype_c.code != kDLFloat || dtype_c.bits != 32 || dtype_c.lanes != 1) { - TVM_FFI_THROW(TypeError) << "Output tensor 'c' must be float32"; - } - - // Validate tensors are contiguous - if (!a.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "Input tensor 'a' must be contiguous"; - } - if (!b.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "Input tensor 'b' must be contiguous"; - } - if (!c.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "Output tensor 'c' must be contiguous"; - } - - // Get data pointers - const float* a_data = static_cast(a.data_ptr()); - const float* b_data = static_cast(b.data_ptr()); - float* c_data = static_cast(c.data_ptr()); - - // Handle empty tensor case - if (n == 0) { - return; - } - - // Get CUDA stream from environment - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev_a.device_type, dev_a.device_id)); - - // Configure kernel launch parameters - const int64_t threads_per_block = 256; - int64_t num_blocks = (n + threads_per_block - 1) / threads_per_block; - - // Launch kernel - ElementwiseAddKernel<<>>( - a_data, b_data, c_data, n); - - // Check for kernel launch errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - TVM_FFI_THROW(RuntimeError) << "CUDA kernel launch failed: " << cudaGetErrorString(err); - } -} - -// Export the function with name "elementwise_add" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); - -} // namespace elementwise_ops - -Test Results: -================================================================================ -Compilation: SUCCESS -Test 1 (small tensor): PASS -Test 2 (large tensor): PASS diff --git a/tests/agent/test_results/gpt-5-2025-08-07.txt b/tests/agent/test_results/gpt-5-2025-08-07.txt deleted file mode 100644 index 39a6d59..0000000 --- a/tests/agent/test_results/gpt-5-2025-08-07.txt +++ /dev/null @@ -1,111 +0,0 @@ -Model: gpt-5-2025-08-07 -================================================================================ - -Generated Code: -================================================================================ -// File: elementwise_add.cu -#include - -#include // TensorView: tensor arguments -#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC -#include // TVM_FFI_ICHECK, TVM_FFI_THROW -#include // TVMFFIEnvGetStream - -namespace elementwise { - -__global__ void ElementwiseAddKernel(const float* __restrict__ a, - const float* __restrict__ b, - float* __restrict__ c, - int64_t n) { - int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx < n) { - c[idx] = a[idx] + b[idx]; - } -} - -void ElementwiseAdd(tvm::ffi::TensorView a, - tvm::ffi::TensorView b, - tvm::ffi::TensorView c) { - // User input validation - if (a.ndim() != 1 || b.ndim() != 1 || c.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "All tensors must be 1D, got ndim: a=" << a.ndim() - << ", b=" << b.ndim() << ", c=" << c.ndim(); - } - - // DType checks: float32 - auto da = a.dtype(); - auto db = b.dtype(); - auto dc = c.dtype(); - if (!(da.code == kDLFloat && da.bits == 32 && da.lanes == 1)) { - 
TVM_FFI_THROW(TypeError) << "Tensor a must be float32"; - } - if (!(db.code == kDLFloat && db.bits == 32 && db.lanes == 1)) { - TVM_FFI_THROW(TypeError) << "Tensor b must be float32"; - } - if (!(dc.code == kDLFloat && dc.bits == 32 && dc.lanes == 1)) { - TVM_FFI_THROW(TypeError) << "Tensor c must be float32"; - } - - // Shape checks - int64_t na = a.size(0); - int64_t nb = b.size(0); - int64_t nc = c.size(0); - if (!(na == nb && nb == nc)) { - TVM_FFI_THROW(ValueError) << "Shape mismatch: a.size(0)=" << na - << ", b.size(0)=" << nb << ", c.size(0)=" << nc; - } - - // Device checks: must be CUDA and all on same device - DLDevice da_dev = a.device(); - DLDevice db_dev = b.device(); - DLDevice dc_dev = c.device(); - if (!(da_dev.device_type == kDLCUDA && db_dev.device_type == kDLCUDA && - dc_dev.device_type == kDLCUDA)) { - TVM_FFI_THROW(RuntimeError) << "elementwise_add supports only CUDA tensors"; - } - if (!(da_dev.device_id == db_dev.device_id && db_dev.device_id == dc_dev.device_id)) { - TVM_FFI_THROW(ValueError) << "All tensors must be on the same CUDA device"; - } - - // Memory layout: require contiguous for simple linear indexing - if (!a.IsContiguous() || !b.IsContiguous() || !c.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "All tensors must be contiguous"; - } - - // Get data pointers - const float* a_ptr = static_cast(a.data_ptr()); - const float* b_ptr = static_cast(b.data_ptr()); - float* c_ptr = static_cast(c.data_ptr()); - - int64_t n = na; - if (n == 0) return; - - // Get CUDA stream from TVM environment - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dc_dev.device_type, dc_dev.device_id)); - - // Launch configuration - const int threads = 256; - unsigned int blocks = static_cast((n + threads - 1) / threads); - if (blocks == 0) return; - - // Launch kernel - ElementwiseAddKernel<<>>(a_ptr, b_ptr, c_ptr, n); - - // Check for launch errors - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - TVM_FFI_THROW(RuntimeError) << "CUDA kernel launch failed: " << cudaGetErrorString(err); - } -} - -} // namespace elementwise - -// Export the function with name "elementwise_add" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, elementwise::ElementwiseAdd) - -Test Results: -================================================================================ -Compilation: SUCCESS -Test 1 (small tensor): PASS -Test 2 (large tensor): PASS diff --git a/tests/agent/test_results/gpt-5-mini-2025-08-07.txt b/tests/agent/test_results/gpt-5-mini-2025-08-07.txt deleted file mode 100644 index 24dd09b..0000000 --- a/tests/agent/test_results/gpt-5-mini-2025-08-07.txt +++ /dev/null @@ -1,129 +0,0 @@ -Model: gpt-5-mini-2025-08-07 -================================================================================ - -Generated Code: -================================================================================ -// File: elementwise_add.cu -#include -#include -#include - -#include // TensorView: tensor arguments -#include // TVMFFIEnvGetStream, TVMFFIEnvTensorAlloc -#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC -#include // TVM_FFI_ICHECK, TVM_FFI_THROW - -using tvm::ffi::TensorView; - -namespace tvm_extra_kernels { - -__global__ void ElementwiseAddKernel(const float* a, const float* b, float* c, int64_t n) { - int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx < n) { - c[idx] = a[idx] + b[idx]; - } -} - -void ElementwiseAdd(TensorView a, TensorView b, TensorView c) { - // Basic shape / type validation (user-facing errors) - if (a.ndim() != 1) { - 
TVM_FFI_THROW(ValueError) << "elementwise_add: input tensor 'a' must be 1D, got " - << a.ndim() << "D"; - } - if (b.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "elementwise_add: input tensor 'b' must be 1D, got " - << b.ndim() << "D"; - } - if (c.ndim() != 1) { - TVM_FFI_THROW(ValueError) << "elementwise_add: output tensor 'c' must be 1D, got " - << c.ndim() << "D"; - } - - // dtype validation: float32 - auto da = a.dtype(); - auto db = b.dtype(); - auto dc = c.dtype(); - if (!(da.code == kDLFloat && da.bits == 32 && da.lanes == 1)) { - TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'a' must be float32"; - } - if (!(db.code == kDLFloat && db.bits == 32 && db.lanes == 1)) { - TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'b' must be float32"; - } - if (!(dc.code == kDLFloat && dc.bits == 32 && dc.lanes == 1)) { - TVM_FFI_THROW(TypeError) << "elementwise_add: tensor 'c' must be float32"; - } - - // shape / size validation - int64_t n = a.size(0); - if (b.size(0) != n) { - TVM_FFI_THROW(ValueError) << "elementwise_add: size mismatch between 'a' and 'b' (" - << n << " vs " << b.size(0) << ")"; - } - if (c.size(0) != n) { - TVM_FFI_THROW(ValueError) << "elementwise_add: output 'c' size must match inputs (expected " - << n << ", got " << c.size(0) << ")"; - } - - // Ensure tensors are contiguous for simple indexing - if (!a.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "elementwise_add: input 'a' must be contiguous"; - } - if (!b.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "elementwise_add: input 'b' must be contiguous"; - } - if (!c.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "elementwise_add: output 'c' must be contiguous"; - } - - // Ensure all tensors are on the same device - DLDevice da_dev = a.device(); - DLDevice db_dev = b.device(); - DLDevice dc_dev = c.device(); - if (!(da_dev.device_type == db_dev.device_type && da_dev.device_id == db_dev.device_id && - da_dev.device_type == dc_dev.device_type && da_dev.device_id == dc_dev.device_id)) { - TVM_FFI_THROW(ValueError) << "elementwise_add: tensors must be on the same device"; - } - - // Pointers to data - float* a_ptr = static_cast(a.data_ptr()); - float* b_ptr = static_cast(b.data_ptr()); - float* c_ptr = static_cast(c.data_ptr()); - - // Dispatch by device type - if (da_dev.device_type == kDLCUDA) { - // GPU path: launch CUDA kernel on environment-managed stream - TVMFFIStreamHandle raw_stream = TVMFFIEnvGetStream(da_dev.device_type, da_dev.device_id); - cudaStream_t stream = static_cast(raw_stream); - - const int threads = 256; - const int64_t blocks = (n + threads - 1) / threads; - ElementwiseAddKernel<<(blocks), threads, 0, stream>>>(a_ptr, b_ptr, c_ptr, n); - - // Check kernel launch error - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - TVM_FFI_THROW(RuntimeError) << "elementwise_add: CUDA kernel launch failed: " - << cudaGetErrorString(err); - } - // Note: we do not synchronize here; environment may manage synchronization. 
- } else if (da_dev.device_type == kDLCPU || da_dev.device_type == kDLCUDAHost) { - // CPU path: simple element-wise loop - for (int64_t i = 0; i < n; ++i) { - c_ptr[i] = a_ptr[i] + b_ptr[i]; - } - } else { - TVM_FFI_THROW(RuntimeError) << "elementwise_add: unsupported device type " - << da_dev.device_type; - } -} - -// Export the function with name "elementwise_add" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); - -} // namespace tvm_extra_kernels - -Test Results: -================================================================================ -Compilation: SUCCESS -Test 1 (small tensor): PASS -Test 2 (large tensor): PASS diff --git a/tests/agent/test_results/o3.txt b/tests/agent/test_results/o3.txt deleted file mode 100644 index 212fd5a..0000000 --- a/tests/agent/test_results/o3.txt +++ /dev/null @@ -1,141 +0,0 @@ -Model: o3 -================================================================================ - -Generated Code: -================================================================================ -// File: elementwise_add.cu -// -// Simple element-wise addition kernel exported through TVM-FFI -// c[i] = a[i] + b[i] -// -// Build example (CUDA >= 11): -// nvcc -std=c++17 -O2 --shared -Xcompiler -fPIC elementwise_add.cu -o libelementwise_add.so -// -// ------------------------------------------------------------------------------ - -#include - -#include // TensorView -#include // TVMFFIEnvGetStream -#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC -#include // ICHECK / THROW helpers - -namespace tvm_add { - -/*! - * \brief CUDA kernel: c[i] = a[i] + b[i] - */ -__global__ void ElementwiseAddKernel(const float* a, - const float* b, - float* c, - int64_t n) { - int64_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; - if (idx < n) { - c[idx] = a[idx] + b[idx]; - } -} - -/*! - * \brief Host wrapper visible to TVM-FFI. - * - * Signature: void(TensorView a, TensorView b, TensorView c) - */ -void ElementwiseAdd(tvm::ffi::TensorView a, - tvm::ffi::TensorView b, - tvm::ffi::TensorView c) { - /* ---------------------- Input validation ---------------------- */ - // 1-D check - if (a.ndim() != 1 || b.ndim() != 1 || c.ndim() != 1) { - TVM_FFI_THROW(ValueError) - << "All tensors must be 1-D. Got ndim = " - << a.ndim() << ", " << b.ndim() << ", " << c.ndim(); - } - - // dtype check (float32 only) - auto check_f32 = [](const tvm::ffi::TensorView& t, const char* name) { - DLDataType dt = t.dtype(); - if (dt.code != kDLFloat || dt.bits != 32 || dt.lanes != 1) { - TVM_FFI_THROW(TypeError) - << "Tensor " << name << " must be float32. " - << "Got (code=" << static_cast(dt.code) - << ", bits=" << dt.bits - << ", lanes=" << dt.lanes << ")"; - } - }; - check_f32(a, "a"); - check_f32(b, "b"); - check_f32(c, "c"); - - // size / shape check - int64_t n = a.size(0); - if (b.size(0) != n || c.size(0) != n) { - TVM_FFI_THROW(ValueError) - << "Mismatched tensor sizes: " - << "a=" << a.size(0) << ", " - << "b=" << b.size(0) << ", " - << "c=" << c.size(0); - } - - // device check (all on same CUDA device) - DLDevice dev_a = a.device(); - DLDevice dev_b = b.device(); - DLDevice dev_c = c.device(); - if (dev_a.device_type != kDLCUDA || dev_b.device_type != kDLCUDA || - dev_c.device_type != kDLCUDA) { - TVM_FFI_THROW(RuntimeError) - << "All tensors must reside on CUDA device. 
" - << "Device types: a=" << dev_a.device_type - << ", b=" << dev_b.device_type - << ", c=" << dev_c.device_type; - } - if (dev_a.device_id != dev_b.device_id || - dev_a.device_id != dev_c.device_id) { - TVM_FFI_THROW(RuntimeError) - << "All tensors must be on the same CUDA device. " - << "Device IDs: a=" << dev_a.device_id - << ", b=" << dev_b.device_id - << ", c=" << dev_c.device_id; - } - - // Optional: require contiguous memory for simplest access pattern - if (!a.IsContiguous() || !b.IsContiguous() || !c.IsContiguous()) { - TVM_FFI_THROW(ValueError) << "All tensors must be contiguous."; - } - - /* -------------------- Launch CUDA kernel ---------------------- */ - if (n == 0) return; // nothing to do - - const float* a_ptr = static_cast(a.data_ptr()); - const float* b_ptr = static_cast(b.data_ptr()); - float* c_ptr = static_cast(c.data_ptr()); - - const int threads = 256; - const int blocks = static_cast((n + threads - 1) / threads); - - // Get stream managed by runtime - cudaStream_t stream = static_cast( - TVMFFIEnvGetStream(dev_a.device_type, dev_a.device_id)); - - ElementwiseAddKernel<<>>(a_ptr, b_ptr, c_ptr, n); - -#ifndef NDEBUG - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) { - TVM_FFI_THROW(RuntimeError) - << "CUDA kernel launch failed: " << cudaGetErrorString(err); - } -#endif -} - -} // namespace tvm_add - -// ----------------------------------------------------------------------------- -// TVM-FFI export -// ----------------------------------------------------------------------------- -TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, tvm_add::ElementwiseAdd); - -Test Results: -================================================================================ -Compilation: SUCCESS -Test 1 (small tensor): PASS -Test 2 (large tensor): PASS diff --git a/tests/agent/test_results/o4-mini-2025-04-16.txt b/tests/agent/test_results/o4-mini-2025-04-16.txt deleted file mode 100644 index 0f67d4b..0000000 --- a/tests/agent/test_results/o4-mini-2025-04-16.txt +++ /dev/null @@ -1,122 +0,0 @@ -Model: o4-mini-2025-04-16 -================================================================================ - -Generated Code: -================================================================================ -// File: elementwise_add.cu - -#include // TensorView -#include // TVMFFIEnvGetStream -#include // TVM_FFI_DLL_EXPORT_TYPED_FUNC -#include // TVM_FFI_ICHECK, TVM_FFI_THROW -#include // cudaStream_t - -// CUDA kernel: c[i] = a[i] + b[i] -__global__ void ElementwiseAddKernel(const float* a, - const float* b, - float* c, - int64_t n) { - int64_t idx = static_cast(blockIdx.x) * blockDim.x - + threadIdx.x; - if (idx < n) { - c[idx] = a[idx] + b[idx]; - } -} - -// Host wrapper exposed via TVM FFI -void ElementwiseAdd(tvm::ffi::TensorView a, - tvm::ffi::TensorView b, - tvm::ffi::TensorView c) { - // 1) Dimensionality checks - if (a.ndim() != 1) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: expected 1D tensor for `a`, got " - << a.ndim() << "D"; - } - if (b.ndim() != 1) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: expected 1D tensor for `b`, got " - << b.ndim() << "D"; - } - if (c.ndim() != 1) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: expected 1D tensor for `c`, got " - << c.ndim() << "D"; - } - - int64_t n = a.size(0); - if (b.size(0) != n) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: size mismatch — `a` has length " - << n << " but `b` has length " << b.size(0); - } - if (c.size(0) != n) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: size 
mismatch — `a` has length " - << n << " but `c` has length " << c.size(0); - } - - // 2) DType checks (float32) - auto da = a.dtype(); - if (da.code != kDLFloat || da.bits != 32 || da.lanes != 1) { - TVM_FFI_THROW(TypeError) - << "elementwise_add: expected float32 dtype for `a`"; - } - auto db = b.dtype(); - if (db.code != kDLFloat || db.bits != 32 || db.lanes != 1) { - TVM_FFI_THROW(TypeError) - << "elementwise_add: expected float32 dtype for `b`"; - } - auto dc = c.dtype(); - if (dc.code != kDLFloat || dc.bits != 32 || dc.lanes != 1) { - TVM_FFI_THROW(TypeError) - << "elementwise_add: expected float32 dtype for `c`"; - } - - // 3) (Optional) Contiguity checks - TVM_FFI_ICHECK(a.IsContiguous()) << "elementwise_add: `a` must be contiguous"; - TVM_FFI_ICHECK(b.IsContiguous()) << "elementwise_add: `b` must be contiguous"; - TVM_FFI_ICHECK(c.IsContiguous()) << "elementwise_add: `c` must be contiguous"; - - // 4) Device checks - DLDevice dev = a.device(); - if (dev.device_type != kDLCUDA) { - TVM_FFI_THROW(RuntimeError) - << "elementwise_add: only CUDA device supported"; - } - if (b.device().device_type != dev.device_type || - b.device().device_id != dev.device_id) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: `b` must be on the same CUDA device as `a`"; - } - if (c.device().device_type != dev.device_type || - c.device().device_id != dev.device_id) { - TVM_FFI_THROW(ValueError) - << "elementwise_add: `c` must be on the same CUDA device as `a`"; - } - - // 5) Pointers - const float* a_ptr = static_cast(a.data_ptr()); - const float* b_ptr = static_cast(b.data_ptr()); - float* c_ptr = static_cast(c.data_ptr()); - - // 6) Get CUDA stream from TVM runtime - cudaStream_t stream = - static_cast(TVMFFIEnvGetStream(dev.device_type, - dev.device_id)); - - // 7) Launch kernel - constexpr int THREADS = 256; - int blocks = static_cast((n + THREADS - 1) / THREADS); - ElementwiseAddKernel<<>>( - a_ptr, b_ptr, c_ptr, n); -} - -// Export for TVM FFI under the name "elementwise_add" -TVM_FFI_DLL_EXPORT_TYPED_FUNC(elementwise_add, ElementwiseAdd); - -Test Results: -================================================================================ -Compilation: SUCCESS -Test 1 (small tensor): PASS -Test 2 (large tensor): PASS From 19a32bbc6076f192372cbcbae030f82d3d2f9b5b Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Fri, 28 Nov 2025 02:44:38 -0500 Subject: [PATCH 07/10] updated agent vibecode ffi example --- .../solutions/agent_vibecode_gemm.json | 0 .../solutions/example_agent_solution.json | 24 +++ examples/ffi/Makefile | 104 +++++++++++ examples/ffi/README.md | 69 +++++++ examples/ffi/{agent.md => agent_vibecode.md} | 2 +- examples/ffi/cpp_example.cc | 173 ++++++++++++++++++ examples/ffi/distribute_kernel.py | 39 ++++ examples/ffi/e2e_kernel.py | 71 ------- examples/ffi/jax_example.py | 62 +++++++ examples/ffi/pytorch_example.py | 49 +++++ 10 files changed, 521 insertions(+), 72 deletions(-) delete mode 100644 examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json create mode 100644 examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json create mode 100644 examples/ffi/Makefile create mode 100644 examples/ffi/README.md rename examples/ffi/{agent.md => agent_vibecode.md} (99%) create mode 100644 examples/ffi/cpp_example.cc create mode 100644 examples/ffi/distribute_kernel.py delete mode 100644 examples/ffi/e2e_kernel.py create mode 100644 examples/ffi/jax_example.py create mode 100644 examples/ffi/pytorch_example.py diff --git 
a/examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json b/examples/ffi/Example-FlashInfer-Trace/solutions/agent_vibecode_gemm.json deleted file mode 100644 index e69de29..0000000 diff --git a/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json b/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json new file mode 100644 index 0000000..01bc4b6 --- /dev/null +++ b/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json @@ -0,0 +1,24 @@ +{ + "name": "example_agent_solution", + "definition": "gemm_n4096_k4096", + "description": "example agent vibecoded kernel generated by gpt-5-2025-08-07 (reasoning effort: high)", + "author": "gpt-5-2025-08-07", + "spec": { + "language": "cuda", + "target_hardware": [ + "B200" + ], + "entry_point": "kernel.cu::gemm_n_4096_k_4096", + "dependencies": [] + }, + "sources": [ + { + "path": "kernel.h", + "content": "#ifndef GEMM_N_4096_K_4096_KERNEL_H\n#define GEMM_N_4096_K_4096_KERNEL_H\n\n#include \n#include \n#include \n\n// Constants fixed by specification\nconstexpr int GEMM_N_CONST = 4096;\nconstexpr int GEMM_K_CONST = 4096;\n\n// Utility: ceiling division\ninline int ceil_div(int a, int b) { return (a + b - 1) / b; }\n\n#endif // GEMM_N_4096_K_4096_KERNEL_H" + }, + { + "path": "kernel.cu", + "content": "#include \"kernel.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n\nusing namespace nvcuda;\n\n// Error check macro\n#ifndef CUDA_CHECK\n#define CUDA_CHECK(expr) \\\n do { \\\n cudaError_t _err = (expr); \\\n if (_err != cudaSuccess) { \\\n fprintf(stderr, \"CUDA Error %s at %s:%d: %s\\n\", #expr, __FILE__, __LINE__, cudaGetErrorString(_err)); \\\n abort(); \\\n } \\\n } while (0)\n#endif\n\n// Kernel configuration tuned for B200\n// - Block tile: 128 x 256 (M x N)\n// - K tile: 64\n// - 8 warps per block (256 threads), each warp computes a 64x64 sub-tile via WMMA (4x4 tiles of 16x16)\n// - Accumulate in FP32, convert to FP16 on store\nconstexpr int BLOCK_M = 128;\nconstexpr int BLOCK_N = 256;\nconstexpr int BLOCK_K = 64;\n\nconstexpr int WARPS_PER_BLOCK = 8;\nconstexpr int THREADS_PER_BLOCK = WARPS_PER_BLOCK * 32;\n\nconstexpr int WARP_TILE_M = 64;\nconstexpr int WARP_TILE_N = 64;\n\nconstexpr int WMMA_M = 16;\nconstexpr int WMMA_N = 16;\nconstexpr int WMMA_K = 16;\n\n// Padding to avoid shared memory bank conflicts (in elements)\nconstexpr int SKEW_HALF = 8; // for half elements\nconstexpr int SKEW_FLOAT = 8; // for float elements\n\n// Align pointer p up to 'alignment' bytes\n__device__ __forceinline__ char* align_up(char* p, size_t alignment) {\n uintptr_t ip = reinterpret_cast(p);\n ip = (ip + (alignment - 1)) & ~(alignment - 1);\n return reinterpret_cast(ip);\n}\n\n__global__ __launch_bounds__(THREADS_PER_BLOCK, 2)\nvoid gemm_n_4096_k_4096_kernel(const __half* __restrict__ A,\n const __half* __restrict__ B,\n __half* __restrict__ C,\n int M) {\n // Shared memory layout (dynamically allocated):\n // [A_smem (half) | B_smem (half) | C_smem (float)]\n extern __shared__ char smem_raw[];\n char* smem_ptr = smem_raw;\n\n // Compute sizes\n const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n\n const size_t A_smem_bytes = A_smem_elems * sizeof(__half);\n const size_t B_smem_bytes = B_smem_elems * sizeof(__half);\n const size_t C_smem_bytes = C_smem_elems * sizeof(float);\n\n __half* A_smem = reinterpret_cast<__half*>(smem_ptr);\n 
smem_ptr = align_up(smem_ptr + A_smem_bytes, 16);\n __half* B_smem = reinterpret_cast<__half*>(smem_ptr);\n smem_ptr = align_up(smem_ptr + B_smem_bytes, 16);\n float* C_smem = reinterpret_cast(smem_ptr);\n\n // Block coordinates\n const int block_m = blockIdx.y; // along M\n const int block_n = blockIdx.x; // along N\n const int m0 = block_m * BLOCK_M;\n const int n0 = block_n * BLOCK_N;\n\n // Early exit if out of range (shouldn't happen due to gridDim.y, but guard anyway)\n if (m0 >= M) return;\n\n // Global strides (row-major)\n const int lda = GEMM_K_CONST; // 4096\n const int ldb = GEMM_K_CONST; // 4096\n const int ldc = GEMM_N_CONST; // 4096\n\n // Thread identifiers\n const int tid = threadIdx.x;\n const int warp_id = tid / 32;\n const int lane_id = tid % 32;\n\n // Warp tile coordinates within the block\n const int WARPS_N = BLOCK_N / WARP_TILE_N; // 256/64 = 4\n const int warp_m_tile = warp_id / WARPS_N; // 0..1\n const int warp_n_tile = warp_id % WARPS_N; // 0..3\n\n // Initialize accumulators\n wmma::fragment c_frag[WARP_TILE_M / WMMA_M][WARP_TILE_N / WMMA_N];\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n wmma::fill_fragment(c_frag[i][j], 0.0f);\n }\n }\n\n // Loop over K dimension in tiles of BLOCK_K\n for (int k0 = 0; k0 < GEMM_K_CONST; k0 += BLOCK_K) {\n\n // Load A tile into shared memory: [BLOCK_M x BLOCK_K] with stride (BLOCK_K + SKEW_HALF)\n {\n const int total_vec = (BLOCK_M * BLOCK_K) / 8; // 1024\n#pragma unroll\n for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n const int vec_idx = tid + v * THREADS_PER_BLOCK;\n const int elem_idx = vec_idx * 8;\n const int row = elem_idx / BLOCK_K;\n const int col = elem_idx % BLOCK_K;\n const int g_row = m0 + row;\n const int g_col = k0 + col;\n\n const __half* gptr = A + g_row * lda + g_col;\n int4 data;\n\n if (g_row < M) {\n data = *reinterpret_cast(gptr);\n } else {\n data = {0, 0, 0, 0};\n }\n\n __half* sptr = A_smem + row * (BLOCK_K + SKEW_HALF) + col;\n *reinterpret_cast(sptr) = data;\n }\n }\n\n // Load B tile into shared memory as [BLOCK_N x BLOCK_K] row-major with stride (BLOCK_K + SKEW_HALF)\n {\n const int total_vec = (BLOCK_N * BLOCK_K) / 8; // 2048\n#pragma unroll\n for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n const int vec_idx = tid + v * THREADS_PER_BLOCK;\n const int elem_idx = vec_idx * 8;\n const int n = elem_idx / BLOCK_K;\n const int kk = elem_idx % BLOCK_K;\n\n const __half* gptr = B + (n0 + n) * ldb + (k0 + kk);\n int4 data = *reinterpret_cast(gptr);\n\n __half* sptr = B_smem + n * (BLOCK_K + SKEW_HALF) + kk;\n *reinterpret_cast(sptr) = data;\n }\n }\n\n __syncthreads();\n\n // Compute using WMMA over BLOCK_K split into 16-wide k-steps\n#pragma unroll\n for (int kk = 0; kk < BLOCK_K; kk += WMMA_K) {\n // Preload 4 B fragments for this warp (across N within the warp tile)\n wmma::fragment b_frag[WARP_TILE_N / WMMA_N];\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n const int n_off = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n const __half* b_tile_ptr = B_smem + n_off * (BLOCK_K + SKEW_HALF) + kk;\n wmma::load_matrix_sync(b_frag[j], b_tile_ptr, (BLOCK_K + SKEW_HALF));\n }\n\n // For each of 4 A subtiles in M within the warp tile, multiply with 4 B fragments\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n const int m_off = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n const __half* a_tile_ptr = A_smem + m_off * (BLOCK_K + SKEW_HALF) + kk;\n\n 
wmma::fragment a_frag;\n wmma::load_matrix_sync(a_frag, a_tile_ptr, (BLOCK_K + SKEW_HALF));\n\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n wmma::mma_sync(c_frag[i][j], a_frag, b_frag[j], c_frag[i][j]);\n }\n }\n }\n\n __syncthreads();\n }\n\n // Store accumulators to shared C_smem (float), then cooperatively convert/store to global as half\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n const int row = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n const int col = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n float* c_tile_ptr = C_smem + row * (BLOCK_N + SKEW_FLOAT) + col;\n wmma::store_matrix_sync(c_tile_ptr, c_frag[i][j], (BLOCK_N + SKEW_FLOAT), wmma::mem_row_major);\n }\n }\n\n __syncthreads();\n\n // Cooperative conversion and store to global memory\n const int total_elems = BLOCK_M * BLOCK_N; // 32768\n#pragma unroll 4\n for (int idx = tid; idx < total_elems; idx += THREADS_PER_BLOCK) {\n const int row = idx / BLOCK_N;\n const int col = idx % BLOCK_N;\n const int g_row = m0 + row;\n const int g_col = n0 + col;\n\n if (g_row < M) {\n float val = C_smem[row * (BLOCK_N + SKEW_FLOAT) + col];\n __half h = __float2half_rn(val);\n C[g_row * ldc + g_col] = h;\n }\n }\n}\n\n// TVM FFI binding function\nvoid gemm_n_4096_k_4096(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) {\n // Validate inputs\n TVM_FFI_ICHECK_EQ(A.ndim(), 2) << \"A must be 2D [M, 4096]\";\n TVM_FFI_ICHECK_EQ(B.ndim(), 2) << \"B must be 2D [4096, 4096]\";\n TVM_FFI_ICHECK_EQ(C.ndim(), 2) << \"C must be 2D [M, 4096]\";\n \n TVM_FFI_ICHECK_EQ(A.size(1), GEMM_K_CONST) << \"A.shape[1] must be 4096 (K)\";\n TVM_FFI_ICHECK_EQ(B.size(0), GEMM_N_CONST) << \"B.shape[0] must be 4096 (N)\";\n TVM_FFI_ICHECK_EQ(B.size(1), GEMM_K_CONST) << \"B.shape[1] must be 4096 (K)\";\n \n const int64_t M = A.size(0);\n TVM_FFI_ICHECK_EQ(C.size(0), M) << \"C.shape[0] must match A.shape[0]\";\n TVM_FFI_ICHECK_EQ(C.size(1), GEMM_N_CONST) << \"C.shape[1] must be 4096 (N)\";\n \n // Check dtype\n DLDataType dt_a = A.dtype();\n DLDataType dt_b = B.dtype();\n DLDataType dt_c = C.dtype();\n \n if (dt_a.code != kDLFloat || dt_a.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"A must be float16\";\n }\n if (dt_b.code != kDLFloat || dt_b.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"B must be float16\";\n }\n if (dt_c.code != kDLFloat || dt_c.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"C must be float16\";\n }\n \n // Check contiguous\n TVM_FFI_ICHECK(A.IsContiguous()) << \"A must be contiguous\";\n TVM_FFI_ICHECK(B.IsContiguous()) << \"B must be contiguous\";\n TVM_FFI_ICHECK(C.IsContiguous()) << \"C must be contiguous\";\n \n // Check device\n DLDevice dev = A.device();\n TVM_FFI_ICHECK_EQ(dev.device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n \n if (M <= 0) return;\n \n // Get data pointers\n const __half* A_ptr = reinterpret_cast(A.data_ptr());\n const __half* B_ptr = reinterpret_cast(B.data_ptr());\n __half* C_ptr = reinterpret_cast<__half*>(C.data_ptr());\n \n // Get CUDA stream from environment\n cudaStream_t stream = static_cast(\n TVMFFIEnvGetStream(dev.device_type, dev.device_id));\n \n // Launch configuration\n dim3 block(THREADS_PER_BLOCK, 1, 1);\n dim3 grid(GEMM_N_CONST / BLOCK_N, ceil_div(static_cast(M), BLOCK_M), 1);\n \n 
// Dynamic shared memory size\n const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n \n const size_t shmem_bytes =\n A_smem_elems * sizeof(__half) +\n B_smem_elems * sizeof(__half) +\n C_smem_elems * sizeof(float);\n \n // Opt-in to large dynamic shared memory if needed\n CUDA_CHECK(cudaFuncSetAttribute(gemm_n_4096_k_4096_kernel,\n cudaFuncAttributeMaxDynamicSharedMemorySize,\n (int)shmem_bytes));\n \n gemm_n_4096_k_4096_kernel<<>>(A_ptr, B_ptr, C_ptr, static_cast(M));\n CUDA_CHECK(cudaGetLastError());\n}\n\n// Export the function with TVM FFI\nTVM_FFI_DLL_EXPORT_TYPED_FUNC(gemm_n_4096_k_4096, gemm_n_4096_k_4096);" + } + ] +} \ No newline at end of file diff --git a/examples/ffi/Makefile b/examples/ffi/Makefile new file mode 100644 index 0000000..f357930 --- /dev/null +++ b/examples/ffi/Makefile @@ -0,0 +1,104 @@ +# Makefile for building the C++ TVM-FFI example +# +# Usage: +# make # Build the example +# make run # Build and run the example +# make clean # Clean build artifacts + +# Compiler settings +CXX := g++ +NVCC := nvcc +CXXFLAGS := -std=c++17 -O3 -Wall + +# TVM-FFI paths +# You can override these by setting environment variables: +# make TVM_FFI_HOME=/path/to/tvm-ffi +# Or specify custom paths directly: +# make TVM_FFI_INCLUDE=/path/to/include TVM_FFI_LIB=/path/to/lib +PYTHON ?= python3 + +TVM_FFI_PKG := $(shell $(PYTHON) -c "import tvm_ffi; print(tvm_ffi.__path__[0])" 2>/dev/null) + +ifeq ($(TVM_FFI_PKG),) + $(warning tvm_ffi not found via Python, falling back to /usr/local) + TVM_FFI_PKG := /usr/local + TVM_FFI_INCLUDE ?= $(TVM_FFI_PKG)/include + TVM_FFI_LIB ?= $(TVM_FFI_PKG)/lib + DLPACK_INCLUDE ?= $(TVM_FFI_PKG)/include +else + TVM_FFI_INCLUDE ?= $(TVM_FFI_PKG)/include + TVM_FFI_LIB ?= $(TVM_FFI_PKG)/lib + DLPACK_INCLUDE ?= $(TVM_FFI_PKG)/../3rdparty/dlpack/include +endif + +# CUDA paths +CUDA_HOME ?= /usr/local/cuda +CUDA_INCLUDE := $(CUDA_HOME)/include +CUDA_LIB := $(CUDA_HOME)/lib64 + +# Include and library flags +INCLUDES := -I$(TVM_FFI_INCLUDE) -I$(DLPACK_INCLUDE) -I$(CUDA_INCLUDE) +LDFLAGS := -L$(TVM_FFI_LIB) -L$(CUDA_LIB) +LIBS := -ltvm_ffi -lcuda -lcudart +RPATH := -Wl,-rpath=$(TVM_FFI_LIB) -Wl,-rpath=$(CUDA_LIB) + +TARGET := cpp_example +SOURCE := cpp_example.cc + +.PHONY: all run clean help check + +all: $(TARGET) + +check: + @echo "Detected Configuration:" + @echo " Python: $(PYTHON)" + @echo " TVM-FFI Package: $(TVM_FFI_PKG)" + @echo " TVM-FFI Include: $(TVM_FFI_INCLUDE)" + @echo " TVM-FFI Library: $(TVM_FFI_LIB)" + @echo " DLPack Include: $(DLPACK_INCLUDE)" + @echo " CUDA Home: $(CUDA_HOME)" + @echo " CUDA Include: $(CUDA_INCLUDE)" + @echo " CUDA Library: $(CUDA_LIB)" + +$(TARGET): $(SOURCE) + @echo "Building $(TARGET)..." + @echo " TVM-FFI include: $(TVM_FFI_INCLUDE)" + @echo " DLPack include: $(DLPACK_INCLUDE)" + @echo " TVM-FFI lib: $(TVM_FFI_LIB)" + @echo " CUDA include: $(CUDA_INCLUDE)" + @echo " CUDA lib: $(CUDA_LIB)" + $(CXX) $(CXXFLAGS) $(INCLUDES) $(SOURCE) -o $(TARGET) $(LDFLAGS) $(LIBS) $(RPATH) + @echo "✓ Build successful: ./$(TARGET)" + +run: $(TARGET) + @echo "Running $(TARGET)..." 
+ @echo "" + ./$(TARGET) + +clean: + rm -f $(TARGET) + @echo "Cleaned build artifacts" + +help: + @echo "TVM-FFI C++ Example Makefile" + @echo "" + @echo "Targets:" + @echo " make - Build the C++ example" + @echo " make run - Build and run the example" + @echo " make check - Show detected paths and configuration" + @echo " make clean - Remove build artifacts" + @echo " make help - Show this help message" + @echo "" + @echo "Environment Variables:" + @echo " PYTHON - Python executable to use for detection (default: python3)" + @echo " TVM_FFI_INCLUDE - Path to TVM-FFI headers (default: auto-detect)" + @echo " TVM_FFI_LIB - Path to TVM-FFI libraries (default: auto-detect)" + @echo " DLPACK_INCLUDE - Path to DLPack headers (default: auto-detect)" + @echo " CUDA_HOME - Path to CUDA installation (default: /usr/local/cuda)" + @echo "" + @echo "Examples:" + @echo " make # Auto-detect all paths" + @echo " make PYTHON=python3.12 # Use specific Python version" + @echo " make CUDA_HOME=/usr/local/cuda-12.0 # Use specific CUDA version" + @echo " make TVM_FFI_INCLUDE=/custom/include TVM_FFI_LIB=/custom/lib # Custom paths" + diff --git a/examples/ffi/README.md b/examples/ffi/README.md new file mode 100644 index 0000000..4687caa --- /dev/null +++ b/examples/ffi/README.md @@ -0,0 +1,69 @@ +# FlashInfer Bench (TVM-FFI) Kernel Distribution Example + +This directory contains examples demonstrating how to build, distribute, and load agent generated CUDA kernels using TVM-FFI across different environments. + +## Installation Prerequisites + +### Python Dependencies +```bash +pip install flashinfer-bench tvm-ffi torch +``` + +### For JAX Example +```bash +pip install jax jax-tvm-ffi +``` + +### For C++ Example +- CUDA Toolkit +- TVM-FFI C++ headers and libraries +- C++17 compatible compiler + +## Usage + +### 1. Generate Kernel with Agent + +Use your preferred coding agent to generate a GEMM kernel solution following `agent_vibecode.md`. Or use the provided example vibecoded kernel using gpt-5-2025-08-07. + +### 2. Build and Distribute +```bash +cd /flashinfer-bench/examples/ffi +python distribute_kernel.py +``` + +This builds the kernel and extracts `kernel.so` to `distributed/` folder. + +### 3. Run Examples + +**JAX:** +```bash +python jax_example.py +``` + +**PyTorch:** +```bash +python pytorch_example.py +``` + +**C++:** +```bash +make run +``` + +Each example loads the distributed kernel, executes it, and prints output shape and elements. + +## How It Works + +The kernel is built using `TVMFFIBuilder`, producing a self-contained `.so` file. This binary can be loaded across different runtimes: + +- **JAX**: Uses `tvm_ffi.load_module()` and `jax.ffi.ffi_call()` +- **PyTorch**: Uses `torch.utils.cpp_extension.load()` with custom CUDA extensions +- **C++**: Uses `ffi::Module::LoadFromFile()` + +The same `.so` file works across all frameworks without recompilation. 
+ +## Notes + +- Kernels use destination-passing style (pre-allocated outputs) +- All examples use CUDA tensors on GPU device 0 +- Entry point format: `file.cu::function_name` \ No newline at end of file diff --git a/examples/ffi/agent.md b/examples/ffi/agent_vibecode.md similarity index 99% rename from examples/ffi/agent.md rename to examples/ffi/agent_vibecode.md index 0523307..5628ae2 100644 --- a/examples/ffi/agent.md +++ b/examples/ffi/agent_vibecode.md @@ -173,7 +173,7 @@ The output JSON must conform to the Solution schema and be written to: ```json { - "name": "agent_vibecode_gemm", + "name": "agent_example_gemm", "definition": "gemm_n4096_k4096", "description": "High-performance CUDA GEMM implementation for C = A @ B.T using TVM FFI bindings", "author": "vibecode-agent", diff --git a/examples/ffi/cpp_example.cc b/examples/ffi/cpp_example.cc new file mode 100644 index 0000000..0126d2b --- /dev/null +++ b/examples/ffi/cpp_example.cc @@ -0,0 +1,173 @@ +/** + * C++ example: Load and run the distributed .so kernel using TVM-FFI C++ API. + * Build with: + * g++ -std=c++17 cpp_example.cc -o cpp_example \ + * -I/path/to/tvm-ffi/include \ + * -L/path/to/tvm-ffi/lib \ + * -ltvm_ffi -lcuda -lcublas \ + * -Wl,-rpath=/path/to/tvm-ffi/lib + */ + + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + #include + + namespace ffi = tvm::ffi; + + std::string read_entry_symbol(const std::string& metadata_path) { + std::ifstream file(metadata_path); + std::string line; + while (std::getline(file, line)) { + if (line.find("Entry Symbol:") == 0) { + size_t colon_pos = line.find(":"); + if (colon_pos != std::string::npos) { + std::string symbol = line.substr(colon_pos + 1); + size_t start = symbol.find_first_not_of(" \t"); + size_t end = symbol.find_last_not_of(" \t\r\n"); + if (start != std::string::npos && end != std::string::npos) { + return symbol.substr(start, end - start + 1); + } + } + } + } + return ""; + } + + void init_random_tensor(ffi::TensorView tensor) { + size_t num_elements = 1; + for (int i = 0; i < tensor.ndim(); ++i) { + num_elements *= tensor.shape()[i]; + } + + std::vector<__half> host_data(num_elements); + + for (size_t i = 0; i < num_elements; ++i) { + host_data[i] = __float2half(static_cast(rand()) / RAND_MAX); + } + + cudaMemcpy(tensor.data_ptr(), host_data.data(), + num_elements * sizeof(__half), cudaMemcpyHostToDevice); + } + + int cuda_tensor_alloc(DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, + void (*SetError)(void* error_ctx, const char* kind, const char* message)) { + size_t num_bytes = 1; + for (int i = 0; i < prototype->ndim; ++i) { + num_bytes *= prototype->shape[i]; + } + num_bytes *= (prototype->dtype.bits * prototype->dtype.lanes + 7) / 8; + + void* ptr; + cudaError_t err = cudaMalloc(&ptr, num_bytes); + if (err != cudaSuccess) { + if (SetError) { + SetError(error_ctx, "RuntimeError", cudaGetErrorString(err)); + } + return -1; + } + + int64_t* shape = new int64_t[prototype->ndim]; + int64_t* strides = nullptr; + for (int i = 0; i < prototype->ndim; ++i) { + shape[i] = prototype->shape[i]; + } + if (prototype->strides) { + strides = new int64_t[prototype->ndim]; + for (int i = 0; i < prototype->ndim; ++i) { + strides[i] = prototype->strides[i]; + } + } + + // Allocate DLManageTensorVersioned structure + DLManagedTensorVersioned* managed = new DLManagedTensorVersioned(); + managed->version = {1, 0}; + managed->manager_ctx = nullptr; + managed->flags = 0; + + // Setup deleter + 
managed->deleter = [](DLManagedTensorVersioned* self) { + if (self->dl_tensor.data) { + cudaFree(self->dl_tensor.data); + } + delete[] self->dl_tensor.shape; + if (self->dl_tensor.strides) { + delete[] self->dl_tensor.strides; + } + delete self; + }; + + // Setup DLTensor + managed->dl_tensor = *prototype; + managed->dl_tensor.data = ptr; + managed->dl_tensor.shape = shape; + managed->dl_tensor.strides = strides; + + *out = managed; + return 0; + } + + ffi::Tensor allocate_cuda_tensor(std::vector shape, DLDataType dtype) { + DLDevice device{kDLCUDA, 0}; + ffi::ShapeView shape_view(shape.data(), shape.size()); + return ffi::Tensor::FromEnvAlloc(TVMFFIEnvTensorAlloc, shape_view, dtype, device); + } + + int main() { + cudaSetDevice(0); + + DLDevice device{kDLCUDA, 0}; + TVMFFIEnvSetDLPackManagedTensorAllocator(cuda_tensor_alloc, 1, nullptr); + + cudaStream_t stream; + cudaStreamCreate(&stream); + TVMFFIEnvSetStream(device.device_type, device.device_id, stream, nullptr); + + const std::string dist_dir = "distributed"; + std::string entry_symbol = read_entry_symbol(dist_dir + "/kernel_metadata.txt"); + + ffi::Module mod = ffi::Module::LoadFromFile(dist_dir + "/kernel.so"); + ffi::Function kernel_fn = mod->GetFunction(entry_symbol).value(); + + std::cout << "Loaded kernel: " << entry_symbol << std::endl; + + // Prepare inputs: C = A @ B.T + const int64_t M = 1024, N = 4096, K = 4096; + DLDataType dtype{kDLFloat, 16, 1}; + + ffi::Tensor A = allocate_cuda_tensor({M, K}, dtype); + ffi::Tensor B = allocate_cuda_tensor({N, K}, dtype); + ffi::Tensor C = allocate_cuda_tensor({M, N}, dtype); + + init_random_tensor(A); + init_random_tensor(B); + cudaMemset(C.data_ptr(), 0, M * N * sizeof(__half)); + + kernel_fn(A, B, C); + cudaDeviceSynchronize(); + + std::vector<__half> host_output(M * N); + cudaMemcpy(host_output.data(), C.data_ptr(), + M * N * sizeof(__half), cudaMemcpyDeviceToHost); + + std::cout << "Output shape: (" << M << ", " << N << ")" << std::endl; + + std::cout << "First 10 elements: ["; + size_t num_to_print = std::min(size_t(10), host_output.size()); + for (size_t i = 0; i < num_to_print; ++i) { + std::cout << __half2float(host_output[i]); + if (i < num_to_print - 1) std::cout << ", "; + } + std::cout << "]" << std::endl; + + cudaStreamDestroy(stream); + + return 0; + } \ No newline at end of file diff --git a/examples/ffi/distribute_kernel.py b/examples/ffi/distribute_kernel.py new file mode 100644 index 0000000..5516df6 --- /dev/null +++ b/examples/ffi/distribute_kernel.py @@ -0,0 +1,39 @@ +import shutil +from pathlib import Path + +from flashinfer_bench import TraceSet +from flashinfer_bench.compile.builders.tvm_ffi_builder import TVMFFIBuilder + + +def main(): + traceset = TraceSet.from_path("Example-FlashInfer-Trace") + + definition_name = "gemm_n4096_k4096" + definition = traceset.definitions[definition_name] + + solutions = list(traceset.solutions[definition_name]) + solution = solutions[0] + print(f"Building solution: {solution.name}") + + builder = TVMFFIBuilder() + runnable = builder.build(definition, solution) + + so_path = runnable.meta["binary"] + entry_symbol = runnable.meta["symbol"] + + dist_dir = Path("distributed") + dist_dir.mkdir(parents=True, exist_ok=True) + shutil.copy2(so_path, dist_dir / "kernel.so") + + with open(dist_dir / "kernel_metadata.txt", "w") as f: + f.write(f"Entry Symbol: {entry_symbol}\n") + f.write(f"Definition: {definition.name}\n") + f.write(f"Solution: {solution.name}\n") + + print(f"Built kernel: {dist_dir / 'kernel.so'}") + print(f"Entry symbol: 
{entry_symbol}") + + +if __name__ == "__main__": + main() + diff --git a/examples/ffi/e2e_kernel.py b/examples/ffi/e2e_kernel.py deleted file mode 100644 index 5cab613..0000000 --- a/examples/ffi/e2e_kernel.py +++ /dev/null @@ -1,71 +0,0 @@ -""" -End-to-end example script, runs benchmark on agent generated gemm kernel, and applies it to a test input. -""" - -import torch - -from flashinfer_bench import ( - Benchmark, - BenchmarkConfig, - TraceSet, - apply, - disable_apply, - enable_apply, -) - - -def main(): - traceset_path = "Example-FlashInfer-Trace" - - print(f"Loading Example-FlashInfer-Trace") - traceset = TraceSet.from_path(traceset_path) - - # Run benchmark - print("\nRunning Benchmark on agent generated gemm kernel") - benchmark = Benchmark(traceset, BenchmarkConfig()) - result_traceset = benchmark.run_all() - - print("\nBenchmark Complete") - for def_name, traces in result_traceset.traces.items(): - print(f"\n{def_name}:") - for trace in traces: - print(f" Solution: {trace.solution.name}") - print(f" Status: {trace.evaluation.status.value}") - if trace.evaluation.performance: - print(f" Speedup: {trace.evaluation.performance.speedup_factor:.2f}x") - - # Apply kernel - print("\nApplying generated kernel on test input") - - with enable_apply(traceset_path): - # Random test inputs for gemm_n4096_k4096 - # A: [M, K] = [1024, 4096] - # B: [N, K] = [4096, 4096] - # C = A @ B.T -> [1024, 4096] - - M, N, K = 1024, 4096, 4096 - A = torch.randn(M, K, dtype=torch.float16, device="cuda") - B = torch.randn(N, K, dtype=torch.float16, device="cuda") - - def reference_gemm(A, B): - return torch.matmul(A, B.T) - - ref_sol = reference_gemm(A, B) - - test_sol = apply( - "gemm_n4096_k4096", runtime_kwargs={"A": A, "B": B}, fallback=reference_gemm - ) - - if test_sol is not None: - diff = torch.abs(ref_sol - test_sol).max().item() - print(f"Max difference vs reference: {diff}") - if diff < 1e-2: - print("Vibe coded kernel is correct") - else: - print("Vibe coded kernel differs from torch matmul") - else: - print("No result returned (using fallback)") - - -if __name__ == "__main__": - main() diff --git a/examples/ffi/jax_example.py b/examples/ffi/jax_example.py new file mode 100644 index 0000000..5515326 --- /dev/null +++ b/examples/ffi/jax_example.py @@ -0,0 +1,62 @@ +""" +JAX example: Load and run the distributed .so kernel using jax-tvm-ffi. 
+ +Requirements: + pip install jax jax-tvm-ffi +""" + +import jax +import jax.numpy as jnp +import tvm_ffi +import jax_tvm_ffi +from pathlib import Path + + +def main(): + dist_dir = Path("distributed") + so_path = dist_dir / "kernel.so" + + entry_symbol = None + for line in (dist_dir / "kernel_metadata.txt").read_text().split('\n'): + if line.startswith("Entry Symbol:"): + entry_symbol = line.split(":", 1)[1].strip() + break + + # Load and register kernel + mod = tvm_ffi.load_module(str(so_path)) + kernel_fn = getattr(mod, entry_symbol) + jax_tvm_ffi.register_ffi_target(entry_symbol, kernel_fn, platform="gpu") + + print(f"Loaded kernel: {entry_symbol}") + + # Prepare inputs: C = A @ B.T + M, N, K = 1024, 4096, 4096 + jax_device = jax.devices("gpu")[0] + A = jnp.array( + jax.random.normal(jax.random.PRNGKey(0), (M, K)), + dtype=jnp.float16, + device=jax_device + ) + B = jnp.array( + jax.random.normal(jax.random.PRNGKey(1), (N, K)), + dtype=jnp.float16, + device=jax_device + ) + + result = jax.ffi.ffi_call( + entry_symbol, + jax.ShapeDtypeStruct((M, N), jnp.float16), + vmap_method="broadcast_all", + )(A, B) + + # Verify + reference = jnp.matmul(A, B.T) + max_diff = jnp.abs(result - reference).max() + + print(f"Max diff vs reference: {max_diff:.6f}") + print(f"Correct: {max_diff < 1e-2}") + + +if __name__ == "__main__": + main() + diff --git a/examples/ffi/pytorch_example.py b/examples/ffi/pytorch_example.py new file mode 100644 index 0000000..dbc46a7 --- /dev/null +++ b/examples/ffi/pytorch_example.py @@ -0,0 +1,49 @@ +""" +PyTorch example: Load and run the distributed .so kernel using tvm-ffi. +""" + +import torch +import tvm_ffi +from pathlib import Path + +def main(): + dist_dir = Path("distributed") + so_path = dist_dir / "kernel.so" + + entry_symbol = None + for line in (dist_dir / "kernel_metadata.txt").read_text().split('\n'): + if line.startswith("Entry Symbol:"): + entry_symbol = line.split(":", 1)[1].strip() + break + + if entry_symbol is None: + raise ValueError("Entry symbol not found in metadata") + + mod = tvm_ffi.load_module(str(so_path)) + kernel_fn = getattr(mod, entry_symbol) + + print(f"Loaded kernel: {entry_symbol}") + + # Prepare inputs: C = A @ B.T + M, N, K = 1024, 4096, 4096 + + torch.manual_seed(0) + A = torch.randn(M, K, dtype=torch.float16, device="cuda") + + torch.manual_seed(1) + B = torch.randn(N, K, dtype=torch.float16, device="cuda") + + C = torch.empty(M, N, dtype=torch.float16, device="cuda") + + # Run kernel: C = A @ B.T + kernel_fn(A, B, C) + + # Verify against PyTorch reference + reference = torch.matmul(A, B.T) + max_diff = torch.abs(C - reference).max().item() + + print(f"Max diff vs reference: {max_diff:.6f}") + print(f"Correct: {max_diff < 1e-2}") + +if __name__ == "__main__": + main() \ No newline at end of file From 170b14f7a6b9943bbcab77c71a264e520c27b7a7 Mon Sep 17 00:00:00 2001 From: Yiyan Zhai <98248913+YiyanZhai@users.noreply.github.com> Date: Tue, 2 Dec 2025 16:29:17 -0500 Subject: [PATCH 08/10] Add workflow overview to README Expanded the README with an overview of the FFI workflow. --- examples/ffi/README.md | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/examples/ffi/README.md b/examples/ffi/README.md index 4687caa..cc6debb 100644 --- a/examples/ffi/README.md +++ b/examples/ffi/README.md @@ -2,6 +2,43 @@ This directory contains examples demonstrating how to build, distribute, and load agent generated CUDA kernels using TVM-FFI across different environments. 
+ +## Overview + +The workflow consists of 4 main stages: + +``` +┌──────────────────────────────────────────────────────────────────────┐ +│ Stage 1: Problem Definition │ +│ - Definition in FlashInfer-Trace dataset │ +└────────────────────────────┬─────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ Stage 2: LLM Kernel Generation │ +│ - LLM reads the Definition + agent_vibecode.md prompt │ +│ - Generates CUDA kernel with TVM-FFI bindings │ +│ - Outputs Solution JSON with embedded source code │ +└────────────────────────────┬─────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ Stage 3: Build & Distribution │ +│ - TVMFFIBuilder compiles the Solution │ +│ - Generates framework-agnostic .so binary │ +│ - Extracts to distributed/ folder with metadata │ +└────────────────────────────┬─────────────────────────────────────────┘ + │ + ▼ +┌──────────────────────────────────────────────────────────────────────┐ +│ Stage 4: Cross-Framework Usage │ +│ - JAX/PyTorch/C++ load the same .so file │ +│ - Execute kernel without recompilation │ +│ - Benchmarking or Apply the kernel │ +└──────────────────────────────────────────────────────────────────────┘ +``` + + ## Installation Prerequisites ### Python Dependencies @@ -66,4 +103,4 @@ The same `.so` file works across all frameworks without recompilation. - Kernels use destination-passing style (pre-allocated outputs) - All examples use CUDA tensors on GPU device 0 -- Entry point format: `file.cu::function_name` \ No newline at end of file +- Entry point format: `file.cu::function_name` From 47de48801e493a94fcac55ca6237cc652f6e8d7a Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Tue, 2 Dec 2025 23:34:33 -0500 Subject: [PATCH 09/10] use updated kernel generator to generate solution (additional option), reformat --- .../solutions/example_agent_solution.json | 2 +- examples/ffi/Makefile | 1 - examples/ffi/README.md | 32 ++++- examples/ffi/cpp_example.cc | 64 +++++----- examples/ffi/distribute_kernel.py | 15 ++- examples/ffi/jax_example.py | 34 +++-- examples/ffi/kernel_generator_example.py | 118 ++++++++++++++++++ examples/ffi/pytorch_example.py | 31 ++--- 8 files changed, 218 insertions(+), 79 deletions(-) create mode 100644 examples/ffi/kernel_generator_example.py diff --git a/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json b/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json index 01bc4b6..da154e1 100644 --- a/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json +++ b/examples/ffi/Example-FlashInfer-Trace/solutions/example_agent_solution.json @@ -21,4 +21,4 @@ "content": "#include \"kernel.h\"\n#include \n#include \n#include \n#include \n#include \n#include \n\nusing namespace nvcuda;\n\n// Error check macro\n#ifndef CUDA_CHECK\n#define CUDA_CHECK(expr) \\\n do { \\\n cudaError_t _err = (expr); \\\n if (_err != cudaSuccess) { \\\n fprintf(stderr, \"CUDA Error %s at %s:%d: %s\\n\", #expr, __FILE__, __LINE__, cudaGetErrorString(_err)); \\\n abort(); \\\n } \\\n } while (0)\n#endif\n\n// Kernel configuration tuned for B200\n// - Block tile: 128 x 256 (M x N)\n// - K tile: 64\n// - 8 warps per block (256 threads), each warp computes a 64x64 sub-tile via WMMA (4x4 tiles of 16x16)\n// - Accumulate in FP32, convert to FP16 on store\nconstexpr int BLOCK_M = 128;\nconstexpr int BLOCK_N = 256;\nconstexpr int BLOCK_K = 64;\n\nconstexpr int 
WARPS_PER_BLOCK = 8;\nconstexpr int THREADS_PER_BLOCK = WARPS_PER_BLOCK * 32;\n\nconstexpr int WARP_TILE_M = 64;\nconstexpr int WARP_TILE_N = 64;\n\nconstexpr int WMMA_M = 16;\nconstexpr int WMMA_N = 16;\nconstexpr int WMMA_K = 16;\n\n// Padding to avoid shared memory bank conflicts (in elements)\nconstexpr int SKEW_HALF = 8; // for half elements\nconstexpr int SKEW_FLOAT = 8; // for float elements\n\n// Align pointer p up to 'alignment' bytes\n__device__ __forceinline__ char* align_up(char* p, size_t alignment) {\n uintptr_t ip = reinterpret_cast(p);\n ip = (ip + (alignment - 1)) & ~(alignment - 1);\n return reinterpret_cast(ip);\n}\n\n__global__ __launch_bounds__(THREADS_PER_BLOCK, 2)\nvoid gemm_n_4096_k_4096_kernel(const __half* __restrict__ A,\n const __half* __restrict__ B,\n __half* __restrict__ C,\n int M) {\n // Shared memory layout (dynamically allocated):\n // [A_smem (half) | B_smem (half) | C_smem (float)]\n extern __shared__ char smem_raw[];\n char* smem_ptr = smem_raw;\n\n // Compute sizes\n const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n\n const size_t A_smem_bytes = A_smem_elems * sizeof(__half);\n const size_t B_smem_bytes = B_smem_elems * sizeof(__half);\n const size_t C_smem_bytes = C_smem_elems * sizeof(float);\n\n __half* A_smem = reinterpret_cast<__half*>(smem_ptr);\n smem_ptr = align_up(smem_ptr + A_smem_bytes, 16);\n __half* B_smem = reinterpret_cast<__half*>(smem_ptr);\n smem_ptr = align_up(smem_ptr + B_smem_bytes, 16);\n float* C_smem = reinterpret_cast(smem_ptr);\n\n // Block coordinates\n const int block_m = blockIdx.y; // along M\n const int block_n = blockIdx.x; // along N\n const int m0 = block_m * BLOCK_M;\n const int n0 = block_n * BLOCK_N;\n\n // Early exit if out of range (shouldn't happen due to gridDim.y, but guard anyway)\n if (m0 >= M) return;\n\n // Global strides (row-major)\n const int lda = GEMM_K_CONST; // 4096\n const int ldb = GEMM_K_CONST; // 4096\n const int ldc = GEMM_N_CONST; // 4096\n\n // Thread identifiers\n const int tid = threadIdx.x;\n const int warp_id = tid / 32;\n const int lane_id = tid % 32;\n\n // Warp tile coordinates within the block\n const int WARPS_N = BLOCK_N / WARP_TILE_N; // 256/64 = 4\n const int warp_m_tile = warp_id / WARPS_N; // 0..1\n const int warp_n_tile = warp_id % WARPS_N; // 0..3\n\n // Initialize accumulators\n wmma::fragment c_frag[WARP_TILE_M / WMMA_M][WARP_TILE_N / WMMA_N];\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n wmma::fill_fragment(c_frag[i][j], 0.0f);\n }\n }\n\n // Loop over K dimension in tiles of BLOCK_K\n for (int k0 = 0; k0 < GEMM_K_CONST; k0 += BLOCK_K) {\n\n // Load A tile into shared memory: [BLOCK_M x BLOCK_K] with stride (BLOCK_K + SKEW_HALF)\n {\n const int total_vec = (BLOCK_M * BLOCK_K) / 8; // 1024\n#pragma unroll\n for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n const int vec_idx = tid + v * THREADS_PER_BLOCK;\n const int elem_idx = vec_idx * 8;\n const int row = elem_idx / BLOCK_K;\n const int col = elem_idx % BLOCK_K;\n const int g_row = m0 + row;\n const int g_col = k0 + col;\n\n const __half* gptr = A + g_row * lda + g_col;\n int4 data;\n\n if (g_row < M) {\n data = *reinterpret_cast(gptr);\n } else {\n data = {0, 0, 0, 0};\n }\n\n __half* sptr = A_smem + row * (BLOCK_K + SKEW_HALF) + col;\n *reinterpret_cast(sptr) = data;\n }\n }\n\n // Load B 
tile into shared memory as [BLOCK_N x BLOCK_K] row-major with stride (BLOCK_K + SKEW_HALF)\n {\n const int total_vec = (BLOCK_N * BLOCK_K) / 8; // 2048\n#pragma unroll\n for (int v = 0; v < (total_vec / THREADS_PER_BLOCK); ++v) {\n const int vec_idx = tid + v * THREADS_PER_BLOCK;\n const int elem_idx = vec_idx * 8;\n const int n = elem_idx / BLOCK_K;\n const int kk = elem_idx % BLOCK_K;\n\n const __half* gptr = B + (n0 + n) * ldb + (k0 + kk);\n int4 data = *reinterpret_cast(gptr);\n\n __half* sptr = B_smem + n * (BLOCK_K + SKEW_HALF) + kk;\n *reinterpret_cast(sptr) = data;\n }\n }\n\n __syncthreads();\n\n // Compute using WMMA over BLOCK_K split into 16-wide k-steps\n#pragma unroll\n for (int kk = 0; kk < BLOCK_K; kk += WMMA_K) {\n // Preload 4 B fragments for this warp (across N within the warp tile)\n wmma::fragment b_frag[WARP_TILE_N / WMMA_N];\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n const int n_off = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n const __half* b_tile_ptr = B_smem + n_off * (BLOCK_K + SKEW_HALF) + kk;\n wmma::load_matrix_sync(b_frag[j], b_tile_ptr, (BLOCK_K + SKEW_HALF));\n }\n\n // For each of 4 A subtiles in M within the warp tile, multiply with 4 B fragments\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n const int m_off = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n const __half* a_tile_ptr = A_smem + m_off * (BLOCK_K + SKEW_HALF) + kk;\n\n wmma::fragment a_frag;\n wmma::load_matrix_sync(a_frag, a_tile_ptr, (BLOCK_K + SKEW_HALF));\n\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n wmma::mma_sync(c_frag[i][j], a_frag, b_frag[j], c_frag[i][j]);\n }\n }\n }\n\n __syncthreads();\n }\n\n // Store accumulators to shared C_smem (float), then cooperatively convert/store to global as half\n#pragma unroll\n for (int i = 0; i < (WARP_TILE_M / WMMA_M); ++i) {\n#pragma unroll\n for (int j = 0; j < (WARP_TILE_N / WMMA_N); ++j) {\n const int row = warp_m_tile * WARP_TILE_M + i * WMMA_M;\n const int col = warp_n_tile * WARP_TILE_N + j * WMMA_N;\n float* c_tile_ptr = C_smem + row * (BLOCK_N + SKEW_FLOAT) + col;\n wmma::store_matrix_sync(c_tile_ptr, c_frag[i][j], (BLOCK_N + SKEW_FLOAT), wmma::mem_row_major);\n }\n }\n\n __syncthreads();\n\n // Cooperative conversion and store to global memory\n const int total_elems = BLOCK_M * BLOCK_N; // 32768\n#pragma unroll 4\n for (int idx = tid; idx < total_elems; idx += THREADS_PER_BLOCK) {\n const int row = idx / BLOCK_N;\n const int col = idx % BLOCK_N;\n const int g_row = m0 + row;\n const int g_col = n0 + col;\n\n if (g_row < M) {\n float val = C_smem[row * (BLOCK_N + SKEW_FLOAT) + col];\n __half h = __float2half_rn(val);\n C[g_row * ldc + g_col] = h;\n }\n }\n}\n\n// TVM FFI binding function\nvoid gemm_n_4096_k_4096(tvm::ffi::TensorView A, tvm::ffi::TensorView B, tvm::ffi::TensorView C) {\n // Validate inputs\n TVM_FFI_ICHECK_EQ(A.ndim(), 2) << \"A must be 2D [M, 4096]\";\n TVM_FFI_ICHECK_EQ(B.ndim(), 2) << \"B must be 2D [4096, 4096]\";\n TVM_FFI_ICHECK_EQ(C.ndim(), 2) << \"C must be 2D [M, 4096]\";\n \n TVM_FFI_ICHECK_EQ(A.size(1), GEMM_K_CONST) << \"A.shape[1] must be 4096 (K)\";\n TVM_FFI_ICHECK_EQ(B.size(0), GEMM_N_CONST) << \"B.shape[0] must be 4096 (N)\";\n TVM_FFI_ICHECK_EQ(B.size(1), GEMM_K_CONST) << \"B.shape[1] must be 4096 (K)\";\n \n const int64_t M = A.size(0);\n TVM_FFI_ICHECK_EQ(C.size(0), M) << \"C.shape[0] must match A.shape[0]\";\n TVM_FFI_ICHECK_EQ(C.size(1), GEMM_N_CONST) << \"C.shape[1] must be 4096 (N)\";\n \n // Check dtype\n DLDataType dt_a = 
A.dtype();\n DLDataType dt_b = B.dtype();\n DLDataType dt_c = C.dtype();\n \n if (dt_a.code != kDLFloat || dt_a.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"A must be float16\";\n }\n if (dt_b.code != kDLFloat || dt_b.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"B must be float16\";\n }\n if (dt_c.code != kDLFloat || dt_c.bits != 16) {\n TVM_FFI_THROW(TypeError) << \"C must be float16\";\n }\n \n // Check contiguous\n TVM_FFI_ICHECK(A.IsContiguous()) << \"A must be contiguous\";\n TVM_FFI_ICHECK(B.IsContiguous()) << \"B must be contiguous\";\n TVM_FFI_ICHECK(C.IsContiguous()) << \"C must be contiguous\";\n \n // Check device\n DLDevice dev = A.device();\n TVM_FFI_ICHECK_EQ(dev.device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n TVM_FFI_ICHECK_EQ(B.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n TVM_FFI_ICHECK_EQ(C.device().device_type, kDLCUDA) << \"Tensors must be on CUDA device\";\n \n if (M <= 0) return;\n \n // Get data pointers\n const __half* A_ptr = reinterpret_cast(A.data_ptr());\n const __half* B_ptr = reinterpret_cast(B.data_ptr());\n __half* C_ptr = reinterpret_cast<__half*>(C.data_ptr());\n \n // Get CUDA stream from environment\n cudaStream_t stream = static_cast(\n TVMFFIEnvGetStream(dev.device_type, dev.device_id));\n \n // Launch configuration\n dim3 block(THREADS_PER_BLOCK, 1, 1);\n dim3 grid(GEMM_N_CONST / BLOCK_N, ceil_div(static_cast(M), BLOCK_M), 1);\n \n // Dynamic shared memory size\n const int A_smem_elems = BLOCK_M * (BLOCK_K + SKEW_HALF);\n const int B_smem_elems = BLOCK_N * (BLOCK_K + SKEW_HALF);\n const int C_smem_elems = BLOCK_M * (BLOCK_N + SKEW_FLOAT);\n \n const size_t shmem_bytes =\n A_smem_elems * sizeof(__half) +\n B_smem_elems * sizeof(__half) +\n C_smem_elems * sizeof(float);\n \n // Opt-in to large dynamic shared memory if needed\n CUDA_CHECK(cudaFuncSetAttribute(gemm_n_4096_k_4096_kernel,\n cudaFuncAttributeMaxDynamicSharedMemorySize,\n (int)shmem_bytes));\n \n gemm_n_4096_k_4096_kernel<<>>(A_ptr, B_ptr, C_ptr, static_cast(M));\n CUDA_CHECK(cudaGetLastError());\n}\n\n// Export the function with TVM FFI\nTVM_FFI_DLL_EXPORT_TYPED_FUNC(gemm_n_4096_k_4096, gemm_n_4096_k_4096);" } ] -} \ No newline at end of file +} diff --git a/examples/ffi/Makefile b/examples/ffi/Makefile index f357930..8da5a20 100644 --- a/examples/ffi/Makefile +++ b/examples/ffi/Makefile @@ -101,4 +101,3 @@ help: @echo " make PYTHON=python3.12 # Use specific Python version" @echo " make CUDA_HOME=/usr/local/cuda-12.0 # Use specific CUDA version" @echo " make TVM_FFI_INCLUDE=/custom/include TVM_FFI_LIB=/custom/lib # Custom paths" - diff --git a/examples/ffi/README.md b/examples/ffi/README.md index 4687caa..8d61075 100644 --- a/examples/ffi/README.md +++ b/examples/ffi/README.md @@ -11,7 +11,7 @@ pip install flashinfer-bench tvm-ffi torch ### For JAX Example ```bash -pip install jax jax-tvm-ffi +pip install jax[cuda13-local] jax-tvm-ffi ``` ### For C++ Example @@ -23,7 +23,33 @@ pip install jax jax-tvm-ffi ### 1. Generate Kernel with Agent -Use your preferred coding agent to generate a GEMM kernel solution following `agent_vibecode.md`. Or use the provided example vibecoded kernel using gpt-5-2025-08-07. +You have two options to generate a CUDA kernel solution: + +**Option A: IDE Coding Agent** + +Use your preferred IDE coding agent to generate a GEMM kernel solution: + +```bash +# Open the instructions in your IDE +cat agent_vibecode.md +``` + +Follow the instructions in `agent_vibecode.md` to have the agent generate the solution interactively. 
+ +**Option B: Kernel Generator** + +Use the kernel generator agent to generate solutions: + +```bash +# Ensure .env is configured in examples/kernel_generator/ +# Required: LLM_API_KEY and BASE_URL +python kernel_generator_example.py +``` + +Configure generation parameters in the script: +- `model_name`: LLM model to use (default: `gpt-5-2025-08-07`) +- `target_gpu`: Target GPU architecture (default: `B200`) +- `gen_rounds`: Number of refinement rounds (default: `10`) ### 2. Build and Distribute ```bash @@ -66,4 +92,4 @@ The same `.so` file works across all frameworks without recompilation. - Kernels use destination-passing style (pre-allocated outputs) - All examples use CUDA tensors on GPU device 0 -- Entry point format: `file.cu::function_name` \ No newline at end of file +- Entry point format: `file.cu::function_name` diff --git a/examples/ffi/cpp_example.cc b/examples/ffi/cpp_example.cc index 0126d2b..0f28c1d 100644 --- a/examples/ffi/cpp_example.cc +++ b/examples/ffi/cpp_example.cc @@ -19,9 +19,9 @@ #include #include #include - + namespace ffi = tvm::ffi; - + std::string read_entry_symbol(const std::string& metadata_path) { std::ifstream file(metadata_path); std::string line; @@ -40,23 +40,23 @@ } return ""; } - + void init_random_tensor(ffi::TensorView tensor) { size_t num_elements = 1; for (int i = 0; i < tensor.ndim(); ++i) { num_elements *= tensor.shape()[i]; } - + std::vector<__half> host_data(num_elements); - + for (size_t i = 0; i < num_elements; ++i) { host_data[i] = __float2half(static_cast(rand()) / RAND_MAX); } - - cudaMemcpy(tensor.data_ptr(), host_data.data(), + + cudaMemcpy(tensor.data_ptr(), host_data.data(), num_elements * sizeof(__half), cudaMemcpyHostToDevice); } - + int cuda_tensor_alloc(DLTensor* prototype, DLManagedTensorVersioned** out, void* error_ctx, void (*SetError)(void* error_ctx, const char* kind, const char* message)) { size_t num_bytes = 1; @@ -64,7 +64,7 @@ num_bytes *= prototype->shape[i]; } num_bytes *= (prototype->dtype.bits * prototype->dtype.lanes + 7) / 8; - + void* ptr; cudaError_t err = cudaMalloc(&ptr, num_bytes); if (err != cudaSuccess) { @@ -73,7 +73,7 @@ } return -1; } - + int64_t* shape = new int64_t[prototype->ndim]; int64_t* strides = nullptr; for (int i = 0; i < prototype->ndim; ++i) { @@ -85,13 +85,13 @@ strides[i] = prototype->strides[i]; } } - + // Allocate DLManageTensorVersioned structure DLManagedTensorVersioned* managed = new DLManagedTensorVersioned(); managed->version = {1, 0}; managed->manager_ctx = nullptr; managed->flags = 0; - + // Setup deleter managed->deleter = [](DLManagedTensorVersioned* self) { if (self->dl_tensor.data) { @@ -103,62 +103,62 @@ } delete self; }; - + // Setup DLTensor managed->dl_tensor = *prototype; managed->dl_tensor.data = ptr; managed->dl_tensor.shape = shape; managed->dl_tensor.strides = strides; - + *out = managed; return 0; } - + ffi::Tensor allocate_cuda_tensor(std::vector shape, DLDataType dtype) { DLDevice device{kDLCUDA, 0}; ffi::ShapeView shape_view(shape.data(), shape.size()); return ffi::Tensor::FromEnvAlloc(TVMFFIEnvTensorAlloc, shape_view, dtype, device); } - + int main() { cudaSetDevice(0); - + DLDevice device{kDLCUDA, 0}; TVMFFIEnvSetDLPackManagedTensorAllocator(cuda_tensor_alloc, 1, nullptr); - + cudaStream_t stream; cudaStreamCreate(&stream); TVMFFIEnvSetStream(device.device_type, device.device_id, stream, nullptr); - + const std::string dist_dir = "distributed"; std::string entry_symbol = read_entry_symbol(dist_dir + "/kernel_metadata.txt"); - + ffi::Module mod = 
ffi::Module::LoadFromFile(dist_dir + "/kernel.so"); ffi::Function kernel_fn = mod->GetFunction(entry_symbol).value(); - + std::cout << "Loaded kernel: " << entry_symbol << std::endl; - + // Prepare inputs: C = A @ B.T const int64_t M = 1024, N = 4096, K = 4096; DLDataType dtype{kDLFloat, 16, 1}; - + ffi::Tensor A = allocate_cuda_tensor({M, K}, dtype); ffi::Tensor B = allocate_cuda_tensor({N, K}, dtype); ffi::Tensor C = allocate_cuda_tensor({M, N}, dtype); - + init_random_tensor(A); init_random_tensor(B); cudaMemset(C.data_ptr(), 0, M * N * sizeof(__half)); - + kernel_fn(A, B, C); cudaDeviceSynchronize(); - + std::vector<__half> host_output(M * N); - cudaMemcpy(host_output.data(), C.data_ptr(), + cudaMemcpy(host_output.data(), C.data_ptr(), M * N * sizeof(__half), cudaMemcpyDeviceToHost); - + std::cout << "Output shape: (" << M << ", " << N << ")" << std::endl; - + std::cout << "First 10 elements: ["; size_t num_to_print = std::min(size_t(10), host_output.size()); for (size_t i = 0; i < num_to_print; ++i) { @@ -166,8 +166,8 @@ if (i < num_to_print - 1) std::cout << ", "; } std::cout << "]" << std::endl; - + cudaStreamDestroy(stream); - + return 0; - } \ No newline at end of file + } diff --git a/examples/ffi/distribute_kernel.py b/examples/ffi/distribute_kernel.py index 5516df6..9831135 100644 --- a/examples/ffi/distribute_kernel.py +++ b/examples/ffi/distribute_kernel.py @@ -7,33 +7,32 @@ def main(): traceset = TraceSet.from_path("Example-FlashInfer-Trace") - + definition_name = "gemm_n4096_k4096" definition = traceset.definitions[definition_name] - + solutions = list(traceset.solutions[definition_name]) solution = solutions[0] print(f"Building solution: {solution.name}") - + builder = TVMFFIBuilder() runnable = builder.build(definition, solution) - + so_path = runnable.meta["binary"] entry_symbol = runnable.meta["symbol"] - + dist_dir = Path("distributed") dist_dir.mkdir(parents=True, exist_ok=True) shutil.copy2(so_path, dist_dir / "kernel.so") - + with open(dist_dir / "kernel_metadata.txt", "w") as f: f.write(f"Entry Symbol: {entry_symbol}\n") f.write(f"Definition: {definition.name}\n") f.write(f"Solution: {solution.name}\n") - + print(f"Built kernel: {dist_dir / 'kernel.so'}") print(f"Entry symbol: {entry_symbol}") if __name__ == "__main__": main() - diff --git a/examples/ffi/jax_example.py b/examples/ffi/jax_example.py index 5515326..8e155e3 100644 --- a/examples/ffi/jax_example.py +++ b/examples/ffi/jax_example.py @@ -5,58 +5,52 @@ pip install jax jax-tvm-ffi """ +from pathlib import Path + import jax import jax.numpy as jnp -import tvm_ffi import jax_tvm_ffi -from pathlib import Path +import tvm_ffi def main(): dist_dir = Path("distributed") so_path = dist_dir / "kernel.so" - + entry_symbol = None - for line in (dist_dir / "kernel_metadata.txt").read_text().split('\n'): + for line in (dist_dir / "kernel_metadata.txt").read_text().split("\n"): if line.startswith("Entry Symbol:"): entry_symbol = line.split(":", 1)[1].strip() break - + # Load and register kernel mod = tvm_ffi.load_module(str(so_path)) kernel_fn = getattr(mod, entry_symbol) jax_tvm_ffi.register_ffi_target(entry_symbol, kernel_fn, platform="gpu") - + print(f"Loaded kernel: {entry_symbol}") - + # Prepare inputs: C = A @ B.T M, N, K = 1024, 4096, 4096 jax_device = jax.devices("gpu")[0] A = jnp.array( - jax.random.normal(jax.random.PRNGKey(0), (M, K)), - dtype=jnp.float16, - device=jax_device + jax.random.normal(jax.random.PRNGKey(0), (M, K)), dtype=jnp.float16, device=jax_device ) B = jnp.array( - 
jax.random.normal(jax.random.PRNGKey(1), (N, K)), - dtype=jnp.float16, - device=jax_device + jax.random.normal(jax.random.PRNGKey(1), (N, K)), dtype=jnp.float16, device=jax_device ) - + result = jax.ffi.ffi_call( - entry_symbol, - jax.ShapeDtypeStruct((M, N), jnp.float16), - vmap_method="broadcast_all", + entry_symbol, jax.ShapeDtypeStruct((M, N), jnp.float16), vmap_method="broadcast_all" )(A, B) - + # Verify reference = jnp.matmul(A, B.T) max_diff = jnp.abs(result - reference).max() - + print(f"Max diff vs reference: {max_diff:.6f}") print(f"Correct: {max_diff < 1e-2}") if __name__ == "__main__": main() - diff --git a/examples/ffi/kernel_generator_example.py b/examples/ffi/kernel_generator_example.py new file mode 100644 index 0000000..2ca1f64 --- /dev/null +++ b/examples/ffi/kernel_generator_example.py @@ -0,0 +1,118 @@ +import json +import os +import sys +from pathlib import Path + +from dotenv import load_dotenv + +from flashinfer_bench import TraceSet +from flashinfer_bench.data import save_json_file + +# Get the path to examples/kernel_generator +script_dir = Path(__file__).parent # examples/ffi +examples_dir = script_dir.parent # examples +kernel_gen_dir = examples_dir / "kernel_generator" + +sys.path.insert(0, str(kernel_gen_dir)) +from kernel_generator import KernelGenerator + +load_dotenv(kernel_gen_dir / ".env") + + +def main(): + """ + Generate optimized CUDA solutions for FFI bindings. + """ + # TODO: select model, target gpu, definition + model_name = "gpt-5-2025-08-07" # Choose model + language = "cuda" # Target CUDA for FFI bindings + target_gpu = "B200" # Target GPU + + print(f"Loading Example TraceSet") + traceset_path = script_dir / "Example-FlashInfer-Trace" + traceset = TraceSet.from_path(traceset_path) + + definition_name = "gemm_n4096_k4096" + definition = traceset.definitions[definition_name] + + api_key = os.getenv("LLM_API_KEY") + base_url = os.getenv("BASE_URL") + if not api_key: + print( + "Please set LLM_API_KEY environment variable or modify this script to pass api_key explicitly" + ) + return + + generator = KernelGenerator( + model_name=model_name, + language=language, + target_gpu=target_gpu, + api_key=api_key, + base_url=base_url, + reasoning_effort="high", + ) + + print(f"\n{'='*60}") + print(f"Generating CUDA solution for: {definition_name}") + print(f"Definition type: {definition.op_type}") + print(f"Target GPU: {target_gpu}") + print(f"{'='*60}") + + # Get workloads for this definition + workloads = traceset.workloads.get(definition_name, []) + if not workloads: + print(f"No workloads found for definition '{definition_name}'") + return + + print(f"Found {len(workloads)} workloads for this definition") + + # Generate solution with beam search + solution = None + max_attempts = 2 + + for attempt in range(1, max_attempts + 1): + try: + print(f"\nAttempt {attempt}/{max_attempts}") + + solution = generator.generate( + traceset=traceset, + definition=definition, + gen_rounds=10, # search depth + beam=True, + beam_width=3, + ) + + print(f"Successfully generated solution for {definition_name}") + break + + except Exception as e: + print(f"Attempt {attempt} failed: {e}") + if attempt < max_attempts: + print(f"Retrying... 
({attempt + 1}/{max_attempts})") + else: + print(f"All attempts failed - aborting") + return + + # Save the solution + if solution: + try: + solutions_dir = Path(traceset_path) / "solutions" + solutions_dir.mkdir(parents=True, exist_ok=True) + + solution_filename = f"{solution.name}.json" + solution_path = solutions_dir / solution_filename + + save_json_file(solution, solution_path) + + print(f"\n{'='*60}") + print(f"SUCCESS!") + print(f"{'='*60}") + print(f"Solution saved to: {solution_path}") + + except Exception as e: + print(f"Failed to save solution: {e}") + return + + +if __name__ == "__main__": + main() diff --git a/examples/ffi/pytorch_example.py b/examples/ffi/pytorch_example.py index dbc46a7..1a2eced 100644 --- a/examples/ffi/pytorch_example.py +++ b/examples/ffi/pytorch_example.py @@ -2,48 +2,51 @@ PyTorch example: Load and run the distributed .so kernel using tvm-ffi. """ +from pathlib import Path + import torch import tvm_ffi -from pathlib import Path + def main(): dist_dir = Path("distributed") so_path = dist_dir / "kernel.so" - + entry_symbol = None - for line in (dist_dir / "kernel_metadata.txt").read_text().split('\n'): + for line in (dist_dir / "kernel_metadata.txt").read_text().split("\n"): if line.startswith("Entry Symbol:"): entry_symbol = line.split(":", 1)[1].strip() break - + if entry_symbol is None: raise ValueError("Entry symbol not found in metadata") - + mod = tvm_ffi.load_module(str(so_path)) kernel_fn = getattr(mod, entry_symbol) - + print(f"Loaded kernel: {entry_symbol}") - + # Prepare inputs: C = A @ B.T M, N, K = 1024, 4096, 4096 - + torch.manual_seed(0) A = torch.randn(M, K, dtype=torch.float16, device="cuda") - + torch.manual_seed(1) B = torch.randn(N, K, dtype=torch.float16, device="cuda") - + C = torch.empty(M, N, dtype=torch.float16, device="cuda") - + # Run kernel: C = A @ B.T kernel_fn(A, B, C) - + # Verify against PyTorch reference reference = torch.matmul(A, B.T) max_diff = torch.abs(C - reference).max().item() - + print(f"Max diff vs reference: {max_diff:.6f}") print(f"Correct: {max_diff < 1e-2}") + if __name__ == "__main__": - main() \ No newline at end of file + main() From a0027ab9a3541b2f884b4ec94f35cbd91267b9cf Mon Sep 17 00:00:00 2001 From: zanderjiang Date: Tue, 2 Dec 2025 23:36:48 -0500 Subject: [PATCH 10/10] remove unnecessary tests --- tests/agent/load_inline.py | 48 ----- tests/agent/test_existing_prompt_sol.py | 226 ------------------------ 2 files changed, 274 deletions(-) delete mode 100644 tests/agent/load_inline.py delete mode 100644 tests/agent/test_existing_prompt_sol.py diff --git a/tests/agent/load_inline.py b/tests/agent/load_inline.py deleted file mode 100644 index 677b6a4..0000000 --- a/tests/agent/load_inline.py +++ /dev/null @@ -1,48 +0,0 @@ -import torch -import tvm_ffi.cpp -from tvm_ffi import Module - -cuda_source = """ -// File: compile/add_one_cuda.cu -#include -#include -#include - -namespace tvm_ffi_example_cuda { - -__global__ void AddOneKernel(float* x, float* y, int n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx < n) { - y[idx] = x[idx] + 1; - } -} - -void AddOne(tvm::ffi::TensorView x, tvm::ffi::TensorView y) { - int64_t n = x.size(0); - float* x_data = static_cast(x.data_ptr()); - float* y_data = static_cast(y.data_ptr()); - int64_t threads = 256; - int64_t blocks = (n + threads - 1) / threads; - cudaStream_t stream = - static_cast(TVMFFIEnvGetStream(x.device().device_type, x.device().device_id)); - AddOneKernel<<>>(x_data, y_data, n); -} - -TVM_FFI_DLL_EXPORT_TYPED_FUNC(add_one_cuda, 
-}  // namespace tvm_ffi_example_cuda
-"""
-
-
-def main():
-    mod: Module = tvm_ffi.cpp.load_inline(name="add_one_cuda", cuda_sources=cuda_source)
-    print("Compilation successful")
-
-    x = torch.randn(8).cuda()
-    y = torch.randn(8).cuda()
-    print(x)
-    mod.add_one_cuda(x, y)
-    print(y)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/tests/agent/test_existing_prompt_sol.py b/tests/agent/test_existing_prompt_sol.py
deleted file mode 100644
index 12a7283..0000000
--- a/tests/agent/test_existing_prompt_sol.py
+++ /dev/null
@@ -1,226 +0,0 @@
-import re
-from pathlib import Path
-
-import torch
-import tvm_ffi.cpp
-from tvm_ffi import Module
-
-
-def extract_code_from_file(file_path: Path) -> tuple[str, str]:
-    with open(file_path, "r") as f:
-        content = f.read()
-
-    model_match = re.search(r"Model: (.+?)\n", content)
-    model_name = model_match.group(1) if model_match else file_path.stem
-
-    # Extract code between "Generated Code:" and "Test Results:"
-    code_pattern = r"Generated Code:\n={80}\n(.*?)\n(?:Test Results:|$)"
-    code_match = re.search(code_pattern, content, re.DOTALL)
-
-    if code_match:
-        code = code_match.group(1).strip()
-    else:
-        # Try alternative pattern - just get everything after "Generated Code:"
-        alt_pattern = r"Generated Code:\n={80}\n(.*)"
-        alt_match = re.search(alt_pattern, content, re.DOTALL)
-        if alt_match:
-            # Get everything until we see "Test Results:" or end of file
-            full_content = alt_match.group(1)
-            if "Test Results:" in full_content:
-                code = full_content.split("Test Results:")[0].strip()
-            else:
-                code = full_content.strip()
-        else:
-            raise ValueError(f"Could not extract code from {file_path}")
-
-    return model_name, code
-
-
-def test_kernel(mod: Module, test_name: str) -> bool:
-    """Test the generated kernel with simple test cases."""
-    try:
-        print(f"\n Running {test_name}...")
-
-        if test_name == "test_small":
-            # Test 1: Small tensor
-            a = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0], dtype=torch.float32, device="cuda")
-            b = torch.tensor([10.0, 20.0, 30.0, 40.0, 50.0], dtype=torch.float32, device="cuda")
-            c = torch.empty_like(a)
-
-            mod.elementwise_add(a, b, c)
-            expected = a + b
-
-            torch.testing.assert_close(c, expected)
-            print(f" ✓ {test_name} passed")
-            return True
-
-        elif test_name == "test_large":
-            # Test 2: Larger tensor
-            n = 10000
-            a = torch.randn(n, dtype=torch.float32, device="cuda")
-            b = torch.randn(n, dtype=torch.float32, device="cuda")
-            c = torch.empty_like(a)
-
-            mod.elementwise_add(a, b, c)
-            expected = a + b
-
-            torch.testing.assert_close(c, expected)
-            print(f" ✓ {test_name} passed")
-            return True
-
-    except Exception as e:
-        print(f" ✗ {test_name} failed: {e}")
-        return False
-
-
-def update_test_results_in_file(file_path: Path, model_name: str, code: str, results: dict):
-    """Update the test results section in a result file, overwriting old results."""
-    # Build the new content
-    content = f"Model: {model_name}\n"
-    content += "=" * 80 + "\n\n"
-    content += "Generated Code:\n"
-    content += "=" * 80 + "\n"
-    content += code + "\n\n"
-    content += "Test Results:\n"
-    content += "=" * 80 + "\n"
-    content += f"Compilation: {results['compilation']}\n"
-
-    if "error" in results:
-        content += f"Error: {results['error']}\n"
-    else:
-        content += f"Test 1 (small tensor): {results['test_small']}\n"
-        content += f"Test 2 (large tensor): {results['test_large']}\n"
-
-    # Write back to file
-    with open(file_path, "w") as f:
-        f.write(content)
-
-
-def test_saved_code(file_path: Path):
-    """Test code from a saved result file."""
-    print(f"\n{'='*80}")
-    print(f"Testing: {file_path.name}")
-    print(f"{'='*80}")
-
-    try:
-        # Extract code
-        model_name, cuda_code = extract_code_from_file(file_path)
-        print(f"Model: {model_name}")
-        print(f"Extracted {len(cuda_code)} characters of code")
-
-        # Try to compile
-        print("Attempting to compile with tvm_ffi...")
-        try:
-            mod: Module = tvm_ffi.cpp.load_inline(
-                name=f'elementwise_add_{model_name.replace("-", "_")}', cuda_sources=cuda_code
-            )
-            print("✓ Compilation successful!")
-
-            # Run tests
-            print("\nRunning tests...")
-            test1_passed = test_kernel(mod, "test_small")
-            test2_passed = test_kernel(mod, "test_large")
-
-            all_passed = test1_passed and test2_passed
-            status = "✓ ALL TESTS PASSED" if all_passed else "✗ SOME TESTS FAILED"
-            print(f"\n{status}")
-
-            # Update the file with new test results
-            update_test_results_in_file(
-                file_path,
-                model_name,
-                cuda_code,
-                {
-                    "compilation": "SUCCESS",
-                    "test_small": "PASS" if test1_passed else "FAIL",
-                    "test_large": "PASS" if test2_passed else "FAIL",
-                },
-            )
-
-            return {
-                "file": file_path.name,
-                "model": model_name,
-                "compilation": "success",
-                "test_small": test1_passed,
-                "test_large": test2_passed,
-                "all_passed": all_passed,
-            }
-
-        except Exception as e:
-            print(f"✗ Compilation failed: {e}")
-
-            # Update the file with compilation failure
-            update_test_results_in_file(
-                file_path, model_name, cuda_code, {"compilation": "FAILED", "error": str(e)}
-            )
-
-            return {
-                "file": file_path.name,
-                "model": model_name,
-                "compilation": "failed",
-                "error": str(e),
-                "all_passed": False,
-            }
-
-    except Exception as e:
-        print(f"✗ Error processing file: {e}")
-        return {
-            "file": file_path.name,
-            "compilation": "error",
-            "error": str(e),
-            "all_passed": False,
-        }
-
-
-def main():
-    """Test all saved code from test_results directory."""
-
-    # Get test_results directory
-    test_results_dir = Path(__file__).parent / "test_results"
-
-    if not test_results_dir.exists():
-        print(f"Error: {test_results_dir} does not exist")
-        return
-
-    # Find all .txt files
-    txt_files = sorted(test_results_dir.glob("*.txt"))
-
-    if not txt_files:
-        print(f"No .txt files found in {test_results_dir}")
-        return
-
-    print("=" * 80)
-    print("Re-testing Saved LLM Generated Code")
-    print("=" * 80)
-    print(f"Files to test: {len(txt_files)}")
-    print(f"Directory: {test_results_dir}")
-
-    results = []
-
-    for idx, file_path in enumerate(txt_files, 1):
-        print(f"\n[{idx}/{len(txt_files)}] Testing {file_path.name}...")
-        result = test_saved_code(file_path)
-        results.append(result)
-
-    # Print summary
-    print(f"\n{'='*80}")
-    print("FINAL SUMMARY")
-    print(f"{'='*80}")
-
-    for result in results:
-        file_name = result["file"]
-        status = "✓ PASS" if result.get("all_passed", False) else "✗ FAIL"
-        print(f"{file_name:40} {status}")
-        if "error" in result:
-            error_msg = result["error"]
-            # Truncate long errors
-            if len(error_msg) > 100:
-                error_msg = error_msg[:100] + "..."
-            print(f" Error: {error_msg}")
-
-    total_passed = sum(1 for r in results if r.get("all_passed", False))
-    print(f"\nTotal: {total_passed}/{len(results)} files passed all tests")
-
-
-if __name__ == "__main__":
-    main()
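Note on the retained examples: `examples/ffi/pytorch_example.py` expects a `distributed/kernel_metadata.txt` file next to `kernel.so` containing an `Entry Symbol:` line, but the packaging step that writes that file is not part of this patch series. The sketch below is a minimal, hypothetical helper (the `write_kernel_metadata` and `read_entry_symbol` names and the sample symbol string are assumptions, not repository APIs) showing one way such a metadata file could be produced and parsed back with the same convention.

```python
from pathlib import Path


def write_kernel_metadata(dist_dir: Path, entry_symbol: str) -> Path:
    """Hypothetical helper: write the metadata file that pytorch_example.py parses.

    Only the "Entry Symbol:" line is required by the example above.
    """
    dist_dir.mkdir(parents=True, exist_ok=True)
    metadata_path = dist_dir / "kernel_metadata.txt"
    metadata_path.write_text(f"Entry Symbol: {entry_symbol}\n")
    return metadata_path


def read_entry_symbol(metadata_path: Path) -> str:
    """Mirror of the parsing loop used in examples/ffi/pytorch_example.py."""
    for line in metadata_path.read_text().split("\n"):
        if line.startswith("Entry Symbol:"):
            return line.split(":", 1)[1].strip()
    raise ValueError("Entry symbol not found in metadata")


if __name__ == "__main__":
    # "gemm_kernel" is an illustrative symbol name, not one defined by this patch.
    path = write_kernel_metadata(Path("distributed"), "gemm_kernel")
    print(read_entry_symbol(path))  # -> gemm_kernel
```

Keeping the writer and the parser in step with the loop in pytorch_example.py avoids the two sides drifting apart if the metadata format later gains more fields.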