Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion docs/source/cli-skills.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,11 @@
### kernels skills add

Use `kernels skills add` to install the skills for AI coding assistants like Claude, Codex, and OpenCode. For now, only the `cuda-kernels` skill is supported. Skill files are downloaded from the `huggingface/kernels` directory in this [repository](https://github.com/huggingface/kernels/tree/main/skills).
Use `kernels skills add` to install the skills for AI coding assistants like Claude, Codex, and OpenCode.
Supported skills include:
- `cuda-kernels` (default)
- `rocm-kernels`

Skill files are downloaded from the `skills` directory of the [`huggingface/kernels` repository](https://github.com/huggingface/kernels/tree/main/skills).

Skills instruct agents on how to handle hardware-specific optimizations, integrate with libraries like diffusers and transformers, and benchmark kernel performance in a consistent way.

Expand All @@ -10,6 +15,9 @@ Examples:
# install for Claude in the current project
kernels skills add --claude

# install the ROCm kernels skill for Codex
kernels skills add --skill rocm-kernels --codex

# install globally for Codex
kernels skills add --codex --global

Expand All @@ -18,6 +26,9 @@ kernels skills add --claude --codex --opencode

# install to a custom destination and overwrite if already present
kernels skills add --dest ~/my-skills --force
```

### Create a new kernel project
```bash
kernels init my-username/my-kernel --skills ~/my-skills
```
14 changes: 12 additions & 2 deletions kernels/src/kernels/cli/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
from pathlib import Path

from kernels.cli.skills import add_skill
from kernels.cli.skills import DEFAULT_SKILL_ID, SUPPORTED_SKILL_IDS, add_skill
from kernels.cli.upload import upload_kernels_dir
from kernels.cli.versions import print_kernel_versions
from kernels.compat import tomllib
Expand Down Expand Up @@ -98,7 +98,17 @@ def main():
skills_subparsers = skills_parser.add_subparsers(required=True)
skills_add_parser = skills_subparsers.add_parser(
"add",
help="Install the cuda-kernels skill for an AI assistant",
help="Install a kernels skill for an AI assistant",
)
skills_add_parser.add_argument(
"--skill",
type=str,
choices=SUPPORTED_SKILL_IDS,
default=DEFAULT_SKILL_ID,
help=(
f"Skill ID to install. Defaults to `{DEFAULT_SKILL_ID}`. "
"Use `rocm-kernels` for ROCm-focused kernels."
),
)
skills_add_parser.add_argument(
"--claude",
Expand Down
49 changes: 33 additions & 16 deletions kernels/src/kernels/cli/skills.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@
from huggingface_hub.utils import get_session

DEFAULT_SKILL_ID = "cuda-kernels"
_GITHUB_RAW_BASE = (
"https://raw.githubusercontent.com/huggingface/kernels/main/" "skills/cuda-kernels"
SUPPORTED_SKILL_IDS = ("cuda-kernels", "rocm-kernels")
_GITHUB_RAW_BASE_TEMPLATE = (
"https://raw.githubusercontent.com/huggingface/kernels/main/skills/{skill_id}"
)
_MANIFEST_URL = f"{_GITHUB_RAW_BASE}/manifest.txt"
_LOCAL_SKILLS_ROOT = Path(__file__).resolve().parents[4] / "skills" / "cuda-kernels"
_LOCAL_SKILLS_DIR = Path(__file__).resolve().parents[4] / "skills"

GLOBAL_TARGETS = {
"codex": Path("~/.codex/skills"),
Expand All @@ -31,12 +31,24 @@ def _download(url: str) -> str:
return response.text


def _download_manifest() -> list[str]:
def _github_raw_base(skill_id: str) -> str:
    """Return the raw GitHub base URL under which ``skill_id``'s files live.

    The URL is produced by substituting ``skill_id`` into the module-level
    ``_GITHUB_RAW_BASE_TEMPLATE``; no trailing slash is appended.
    """
    base_url = _GITHUB_RAW_BASE_TEMPLATE.format(skill_id=skill_id)
    return base_url


def _manifest_url(skill_id: str) -> str:
    """Return the URL of the ``manifest.txt`` file for ``skill_id``.

    The manifest sits directly under the skill's raw GitHub base URL.
    """
    return "/".join((_github_raw_base(skill_id), "manifest.txt"))


def _local_skill_root(skill_id: str) -> Path:
    """Return the directory for ``skill_id`` inside the local skills folder.

    The path is ``_LOCAL_SKILLS_DIR`` joined with ``skill_id``; existence of
    the directory is not checked here.
    """
    return _LOCAL_SKILLS_DIR.joinpath(skill_id)


def _download_manifest(skill_id: str) -> list[str]:
entries: list[str] = []
try:
raw_manifest = _download(_MANIFEST_URL)
raw_manifest = _download(_manifest_url(skill_id))
except Exception:
local_manifest = _LOCAL_SKILLS_ROOT / "manifest.txt"
local_manifest = _local_skill_root(skill_id) / "manifest.txt"
if not local_manifest.exists():
raise
raw_manifest = local_manifest.read_text(encoding="utf-8")
Expand All @@ -49,11 +61,11 @@ def _download_manifest() -> list[str]:
return entries


def _download_file(rel_path: str) -> str:
def _download_file(skill_id: str, rel_path: str) -> str:
try:
return _download(f"{_GITHUB_RAW_BASE}/{rel_path}")
return _download(f"{_github_raw_base(skill_id)}/{rel_path}")
except Exception:
local_file = _LOCAL_SKILLS_ROOT / rel_path
local_file = _local_skill_root(skill_id) / rel_path
if local_file.exists():
return local_file.read_text(encoding="utf-8")
raise
Expand All @@ -67,10 +79,10 @@ def _remove_existing(path: Path) -> None:
shutil.rmtree(path)


def _install_to(target: Path, force: bool) -> Path:
def _install_to(target: Path, force: bool, skill_id: str) -> Path:
target = target.expanduser().resolve()
target.mkdir(parents=True, exist_ok=True)
dest = target / DEFAULT_SKILL_ID
dest = target / skill_id

if dest.exists():
if not force:
Expand All @@ -79,8 +91,8 @@ def _install_to(target: Path, force: bool) -> Path:
)
_remove_existing(dest)

for rel_path in _download_manifest():
content = _download_file(rel_path)
for rel_path in _download_manifest(skill_id):
content = _download_file(skill_id, rel_path)
output_file = dest / rel_path
output_file.parent.mkdir(parents=True, exist_ok=True)
output_file.write_text(content, encoding="utf-8")
Expand All @@ -89,6 +101,11 @@ def _install_to(target: Path, force: bool) -> Path:


def add_skill(args: Namespace) -> None:
skill_id = getattr(args, "skill", DEFAULT_SKILL_ID)
if skill_id not in SUPPORTED_SKILL_IDS:
supported = ", ".join(SUPPORTED_SKILL_IDS)
raise SystemExit(f"Unsupported skill '{skill_id}'. Supported skills: {supported}")

if not (args.claude or args.codex or args.opencode or args.dest):
print(
"Pick a destination via --claude, --codex, --opencode, or --dest.",
Expand All @@ -109,5 +126,5 @@ def add_skill(args: Namespace) -> None:
targets.append(args.dest)

for target in targets:
installed_path = _install_to(target, force=args.force)
print(f"Installed '{DEFAULT_SKILL_ID}' to {installed_path}")
installed_path = _install_to(target, force=args.force, skill_id=skill_id)
print(f"Installed '{skill_id}' to {installed_path}")
61 changes: 61 additions & 0 deletions skills/rocm-kernels/CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# Changelog

## v0.3 (2026-04-10)

### Added
- **Consolidated benchmark package**: `examples/ltx-video-benchmark/benchmark_results.json` now provides a single JSON artifact with formal R9700 benchmark runs and summary statistics.
- **Live harness benchmark trace**:
- `examples/ltx-video-benchmark/trace/opencode_live/opencode_trace_result.json`
- Supporting run output in `examples/ltx-video-benchmark/trace/opencode_live/results.json`
- **Unified examples documentation**: `examples/ltx-video-benchmark/README.md` is now the single reviewer-facing README for the examples package.
- **Dependency lock-in entrypoint**: `scripts/requirements.txt` is used as the canonical install file for benchmark/integration examples.

### Changed
- **E2E benchmark CLI/output semantics** (`scripts/benchmark_e2e.py`):
- Standardized output flag to `--output-dir`
- Updated docs and usage examples to match current examples layout
- Structured outputs under `examples/ltx-video-benchmark/` and `trace/*_live/`
- **Examples content structure**:
- Merged previous top-level examples README content into `examples/ltx-video-benchmark/README.md`
- Updated trace/result paths in docs to current live locations
- **Script requirement notes**:
- Updated benchmark/example script docstrings to install dependencies from `scripts/requirements.txt`

### Fixed
- Removed ambiguity between "harness replay" and "live benchmark execution" by recording explicit `live_benchmark: true` trace outputs with executed command/config.
- Aligned reviewer-facing materials to use one benchmark documentation entrypoint and one consolidated formal benchmark JSON.

### Removed
- Legacy split/duplicate examples documentation and outdated trace path references in the examples package.
- Codex-specific benchmark trace artifacts and non-essential video artifacts from the reviewer package.

## v0.2 (2026-03-12)

### Added
- **Transformers integration**: `references/transformers-integration.md` — LLaMA/Mistral/Qwen RMSNorm patching, Flash Attention 2, epsilon handling differences
- **Transformers injection script**: `scripts/transformers_injection_example.py` — minimal runnable example (~150 lines)
- **HuggingFace Kernels Hub integration**: `references/huggingface-kernels-integration.md` — `get_kernel`, `has_kernel`, publishing, ROCm compatibility notes
- **HuggingFace Kernels example script**: `scripts/huggingface_kernels_example.py` — Hub loading, benchmarking, model integration with fallback
- **GEMM template with XCD swizzle**: Template 5 in `kernel-templates.md` — full GEMM kernel with XCD swizzle for MI355X, L2 cache grouping, autotune configs, Python API, and benchmark
- **CHANGELOG.md**: Version tracking for skill iterations

### Fixed
- Broken cross-references: "Template 2" for GEMM → corrected to "Template 5" in `troubleshooting.md`, `kernelbench-classification.md`, and `skill-evaluation-methodology.md`
- R9700 Memory Bandwidth: filled in ~608 GB/s (was TBD) in SKILL.md

### Updated
- `SKILL.md` See Also section: added new integration guides, scripts, and Hub links
- `SKILL.md` argument-hint: added gemm, transformers, huggingface-kernels, get_kernel
- `manifest.txt`: added all new files

## v0.1 (2026-03-10)

### Added
- Initial skill with SKILL.md, 4 kernel templates (RMSNorm, RoPE 3D, GEGLU, AdaLN)
- MI355X and R9700 GPU optimization guides
- Diffusers integration guide (LTX-Video)
- Troubleshooting guide (14 ROCm-specific issues)
- Benchmark scripts: micro-benchmark (`benchmark_kernels.py`) and E2E (`benchmark_e2e.py`)
- LTX-Video injection example (`ltx_kernel_injection_example.py`)
- KernelBench classification and evaluation methodology docs
- Kernel-agent knowledge base
Loading