Merged
34 changes: 33 additions & 1 deletion cecli/helpers/skills.py
@@ -35,6 +35,7 @@ class SkillContent:
references: Dict[str, Path] = field(default_factory=dict)
scripts: Dict[str, Path] = field(default_factory=dict)
assets: Dict[str, Path] = field(default_factory=dict)
evals: Dict[str, Path] = field(default_factory=dict)


class SkillsManager:
@@ -227,13 +228,17 @@ def _load_complete_skill(self, metadata: SkillMetadata) -> SkillContent:
# Load assets
assets = self._load_assets(skill_dir)

# Load evals
evals = self._load_evals(skill_dir)

return SkillContent(
metadata=metadata,
frontmatter=frontmatter,
instructions=instructions,
references=references,
scripts=scripts,
assets=assets,
evals=evals,
)

def _load_references(self, skill_dir: Path) -> Dict[str, Path]:
@@ -286,6 +291,23 @@ def _load_assets(self, skill_dir: Path) -> Dict[str, Path]:

return assets

def _load_evals(self, skill_dir: Path) -> Dict[str, Path]:
"""Load eval files from the evals/ directory."""
evals = {}
evals_dir = skill_dir / "evals"

if evals_dir.exists():
for eval_file in evals_dir.glob("**/*"):
if eval_file.is_file():
try:
# Use relative path as key, store the Path object
rel_path = eval_file.relative_to(evals_dir)
evals[str(rel_path)] = eval_file
except Exception:
continue

return evals

def get_skill_summary(self, skill_name: str) -> Optional[str]:
"""
Get a summary of a skill for display purposes.
@@ -315,9 +337,11 @@ def get_skill_summary(self, skill_name: str) -> Optional[str]:
ref_count = len(skill.references)
script_count = len(skill.scripts)
asset_count = len(skill.assets)
eval_count = len(skill.evals)

summary += (
f"Resources: {ref_count} references, {script_count} scripts, {asset_count} assets,"
f" {eval_count} evals\n"
)

return summary
@@ -540,6 +564,14 @@ def get_skills_content(self) -> Optional[str]:
result += f"- **{asset_name}**: `{asset_path}`\n"
result += "\n"

# Add evals file paths
if skill_content.evals:
result += f"#### Evals ({len(skill_content.evals)} file(s))\n\n"
result += "Available eval files:\n\n"
for eval_name, eval_path in skill_content.evals.items():
result += f"- **{eval_name}**: `{eval_path}`\n"
result += "\n"

result += "---\n\n"

result += "</context>"
1 change: 1 addition & 0 deletions cecli/website/docs/config/agent-mode.md
@@ -52,6 +52,7 @@ Agent Mode uses a centralized local tool registry that manages all available tools
- **Git Tools**: `GitDiff`, `GitLog`, `GitShow`, `GitStatus`
- **Utility Tools**: `UpdateTodoList`, `ListChanges`, `UndoChange`, `Finished`
- **Skill Management**: `LoadSkill`, `RemoveSkill`
- **Eval Management**: `RunEvals`

#### Enhanced Context Management

119 changes: 113 additions & 6 deletions cecli/website/docs/config/skills.md
@@ -20,9 +20,11 @@ skill-name/
├── scripts/ # Executable scripts
│ ├── example-setup.sh # Setup script
│ └── example-deploy.py # Deployment script
├── assets/ # Binary assets (images, config files, etc.)
│ ├── example-diagram.png # Architecture diagram
│ └── example-config.json # Configuration file
└── evals/
    └── evals.json # Evaluation tests
```

## SKILL.md Format
@@ -66,6 +68,91 @@ def process_data(data):
return result
```

## Evals Format (`evals.json`)

The `evals/` directory contains `evals.json` files for testing skill performance. These evaluations help ensure that a skill behaves as expected and provide a way to measure its accuracy and effectiveness. Eval files can be executed with the `RunEvals` tool in Agent Mode.

`evals.json` files can be in one of two formats:

### Standard Format

The standard format includes metadata about the skill and a list of evaluation cases.

**Structure:**

```json
{
"skill_name": "your-skill-name",
"evals": [
{
"id": 1,
"prompt": "A user query to test the skill.",
"expected_output": "A description of the ideal response from the AI.",
"assertions": [
"A list of specific points or phrases that must be in the output.",
"Another assertion to check for.",
"And so on..."
],
"files": [
"path/to/test/file1.txt",
"path/to/test/file2.py"
]
}
]
}
```

- **`skill_name`**: The name of the skill being evaluated.
- **`evals`**: An array of evaluation objects.
- **`id`**: A unique identifier for the test case.
- **`prompt`**: The input prompt to send to the AI.
- **`expected_output`**: A natural language description of what the ideal response should contain.
- **`assertions`**: A list of specific, verifiable statements that must be true about the AI's output. These are used for automated checking.
- **`files`**: A list of file paths to be included in the context when running the evaluation.
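As a rough illustration of how this format hangs together (this is not the actual `RunEvals` implementation; `load_standard_evals` is a hypothetical helper), a standard-format file could be loaded and shape-checked like this:

```python
import json
from pathlib import Path


def load_standard_evals(path):
    """Load a standard-format evals.json and check its basic shape."""
    data = json.loads(Path(path).read_text())
    # Top level must be an object with skill metadata and a list of cases.
    for key in ("skill_name", "evals"):
        if key not in data:
            raise ValueError(f"standard format requires a '{key}' key")
    # Each case needs the fields documented above; 'files' is optional context.
    for case in data["evals"]:
        for field in ("id", "prompt", "expected_output", "assertions"):
            if field not in case:
                raise ValueError(f"eval case {case.get('id')!r} is missing '{field}'")
    return data
```

A runner would then feed each case's `prompt` (plus any `files`) to the model and judge the response against `expected_output` and `assertions`.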

### Assertion-Based Format

This format is a direct array of evaluation cases, each with structured assertions. This is useful for more granular, automated testing.

**Structure:**

```json
[
{
"id": "billing-charge-error",
"description": "Clear billing question about a charge",
"input": "I was charged $99 but I only signed up for the $49 plan.",
"assertions": [
{ "type": "exact", "value": "BILLING" }
]
},
{
"id": "technical-api-error",
"description": "API authentication failure is TECHNICAL",
"input": "I keep getting a 403 error when I try to authenticate.",
"assertions": [
{ "type": "exact", "value": "TECHNICAL" }
]
},
{
"id": "no-extra-text",
"description": "Output should only be the label — nothing else",
"input": "Where can I find my invoices?",
"assertions": [
{ "type": "contains", "value": "BILLING" },
{ "type": "max_length", "value": 10 }
]
}
]
```

- **`id`**: A unique string identifier for the test case.
- **`description`**: A brief explanation of the test case's purpose.
- **`input`**: The input prompt to send to the AI.
- **`assertions`**: An array of assertion objects for automated validation.
- **`type`**: The type of assertion (e.g., `exact`, `contains`, `max_length`).
- **`value`**: The value to check against.
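The PR excerpt does not show how `RunEvals` interprets these assertion objects, but as a minimal sketch (assuming only the three types shown above; `check_assertion` is a hypothetical helper), they could be validated mechanically:

```python
def check_assertion(output: str, assertion: dict) -> bool:
    """Return True if a model output satisfies one assertion object."""
    kind, value = assertion["type"], assertion["value"]
    if kind == "exact":
        # The trimmed output must equal the value verbatim.
        return output.strip() == value
    if kind == "contains":
        # The value must appear somewhere in the output.
        return value in output
    if kind == "max_length":
        # The trimmed output must not exceed the given length.
        return len(output.strip()) <= value
    raise ValueError(f"unsupported assertion type: {kind}")
```

For the `no-extra-text` case above, an output of exactly `"BILLING"` passes both assertions, while `"The answer is BILLING"` passes `contains` but fails `max_length`.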

## Skill Configuration

Skills are configured through the `agent-config` parameter in the YAML configuration file. The following options are available:
@@ -105,13 +192,14 @@ To create a custom skill:
1. Create a skill directory with the skill name
2. Add `SKILL.md` with YAML frontmatter and instructions
3. Add reference materials in `references/` directory
4. Add executable scripts in `scripts/` directory
5. Add binary assets in `assets/` directory
6. Add evaluation tests in `evals/` directory to test skill performance
7. Test the skill by adding it to your configuration file:

Example skill creation:
```bash
mkdir -p ~/skills/my-custom-skill/{references,scripts,assets}
mkdir -p ~/skills/my-custom-skill/{references,scripts,assets,evals}

cat > ~/skills/my-custom-skill/SKILL.md << 'EOF'
---
@@ -152,6 +240,25 @@ echo "Setting up my custom skill..."
# Setup commands here
EOF
chmod +x ~/skills/my-custom-skill/scripts/setup.sh

# Add an eval file
cat > ~/skills/my-custom-skill/evals/evals.json << 'EOF'
{
"skill_name": "my-custom-skill",
"evals": [
{
"id": 1,
"prompt": "Test prompt for feature 1",
"expected_output": "Expected behavior for feature 1",
"assertions": [
"Should do this",
"Should not do that"
],
"files": []
}
]
}
EOF
```

## Best Practices for Skills