Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 35 additions & 2 deletions benchmark/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -515,7 +515,8 @@ def load_results(results_dir, stats_languages=None):
results["test_cases_total"] = total_cases
if passed_cases is not None:
results["test_cases_passed"] = passed_cases

if passed_cases is not None and total_cases is not None:
results["test_cases_pass_ratio"] = passed_cases / total_cases
# Update the JSON file immediately to keep data fresh
fname.write_text(json.dumps(results, indent=4))
logger.debug(f"Updated {fname} with test case counts")
Expand Down Expand Up @@ -583,7 +584,8 @@ def summarize_results(results_dir, verbose, stats_languages=None):
res.completion_tokens = 0
res.test_cases_total = 0
res.test_cases_passed = 0

res.test_cases_pass_ratio_sum = 0.0
res.test_cases_pass_ratio_count = 0
res.reasoning_effort = None
res.thinking_tokens = None
res.map_tokens = None
Expand Down Expand Up @@ -621,6 +623,8 @@ def add(attr_name, increment, global_stats, lang_stats):
lang_stats.completion_tokens = 0
lang_stats.test_cases_total = 0
lang_stats.test_cases_passed = 0
lang_stats.test_cases_pass_ratio_sum = 0.0
lang_stats.test_cases_pass_ratio_count = 0
lang_to_stats[lang] = lang_stats
lang_to_passed_tests[lang] = [0] * tries

Expand Down Expand Up @@ -682,6 +686,11 @@ def add(attr_name, increment, global_stats, lang_stats):
if passed_cases is not None:
add("test_cases_passed", passed_cases, res, lang_stats)

pass_ratio = results.get("test_cases_pass_ratio")
if pass_ratio is not None:
add("test_cases_pass_ratio_sum", pass_ratio, res, lang_stats)
add("test_cases_pass_ratio_count", 1, res, lang_stats)

res.reasoning_effort = results.get("reasoning_effort")
res.thinking_tokens = results.get("thinking_tokens")
res.map_tokens = results.get("map_tokens")
Expand Down Expand Up @@ -764,6 +773,12 @@ def show(stat, red="red"):
res.test_cases_percentage = 100 * res.test_cases_passed / res.test_cases_total
print(f" test_cases_percentage: {res.test_cases_percentage:.1f}")

if res.test_cases_pass_ratio_count > 0:
res.test_cases_pass_ratio_avg = (
100 * res.test_cases_pass_ratio_sum / res.test_cases_pass_ratio_count
)
print(f" test_cases_pass_ratio_avg: {res.test_cases_pass_ratio_avg:.1f}")

if variants["model"]:
a_model = set(variants["model"]).pop()
command = f"cecli --model {a_model}"
Expand Down Expand Up @@ -801,6 +816,18 @@ def format_lang_stats(lang, lang_stats):
pass_rate = 100 * num_passed / float(lang_stats.completed_tests)
setattr(lang_stats, f"pass_rate_{i + 1}", pass_rate)

if lang_stats.test_cases_pass_ratio_count > 0:
lang_stats.test_cases_pass_ratio_avg = (
100
* lang_stats.test_cases_pass_ratio_sum
/ lang_stats.test_cases_pass_ratio_count
)
else:
lang_stats.test_cases_pass_ratio_avg = 0.0

del lang_stats.test_cases_pass_ratio_sum
del lang_stats.test_cases_pass_ratio_count

# Then format attributes into ready-to-print strings
for attr in lang_stats.__dict__:
val = getattr(lang_stats, attr)
Expand Down Expand Up @@ -870,6 +897,11 @@ def compute_lang_to_col_widths(lang_to_stats):
console.rule()

# print(json.dumps(vars(res), indent=4, sort_keys=True))
if hasattr(res, "test_cases_pass_ratio_sum"):
del res.test_cases_pass_ratio_sum
if hasattr(res, "test_cases_pass_ratio_count"):
del res.test_cases_pass_ratio_count

return res


Expand Down Expand Up @@ -1156,6 +1188,7 @@ async def run_test_real(
verbose=verbose,
yes_always_commands=True,
max_reflections=0,
file_diffs=False,
),
map_mul_no_files=4,
mcp_manager=None,
Expand Down
203 changes: 203 additions & 0 deletions benchmark/variations.1.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
#!/bin/bash
# Benchmark runner script for testing multiple OpenRouter models
# Usage: ./run_benchmark_variations.sh [OPTIONS]

set -e # Exit on error

# Default values
BASE_NAME="cecli-little-guys-h6"
EDIT_FORMAT="hashline"
MAP_TOKENS="512"
THREADS="1"
LANGUAGES="javascript,python,rust,go,java"
HASH_RE="^.[15ef]"
NUM_TESTS="72"
EXERCISES_DIR="polyglot-benchmark"
OUTPUT_DIR="tmp.benchmarks"
SLEEP_BETWEEN=30 # Seconds to sleep between runs

# List of models to test
# RERUN
# "openrouter/minimax/minimax-m2.1"
# "openrouter/qwen/qwen3-vl-235b-a22b-thinking"
MODELS=(
# "openrouter/qwen/qwen3.5-35b-a3b"
# "openrouter/xiaomi/mimo-v2-flash"
# "openrouter/moonshotai/kimi-k2.5"
# "openrouter/minimax/minimax-m2.5"
"openrouter/anthropic/claude-haiku-4.5"
"openrouter/openai/gpt-oss-120b"
"openrouter/openai/gpt-5-mini"
"openrouter/google/gemini-3-flash-preview"
"openrouter/deepseek/deepseek-v3.2-exp"
)

# Parse command line arguments
while [[ $# -gt 0 ]]; do
case $1 in
--base-name)
BASE_NAME="$2"
shift 2
;;
--edit-format)
EDIT_FORMAT="$2"
shift 2
;;
--map-tokens)
MAP_TOKENS="$2"
shift 2
;;
--threads)
THREADS="$2"
shift 2
;;
--hash-re)
HASH_RE="$2"
shift 2
;;
--num-tests)
NUM_TESTS="$2"
shift 2
;;
--exercises-dir)
EXERCISES_DIR="$2"
shift 2
;;
--output-dir)
OUTPUT_DIR="$2"
shift 2
;;
--sleep)
SLEEP_BETWEEN="$2"
shift 2
;;
--help)
echo "Usage: $0 [OPTIONS]"
echo ""
echo "Options:"
echo " --base-name NAME Base name for benchmark runs (default: $BASE_NAME)"
echo " --edit-format FORMAT Edit format to use (default: $EDIT_FORMAT)"
echo " --map-tokens TOKENS Map tokens (default: $MAP_TOKENS)"
echo " --threads N Number of threads (default: $THREADS)"
echo " --hash-re REGEX Hash regex filter (default: $HASH_RE)"
echo " --num-tests N Number of tests to run (default: $NUM_TESTS)"
echo " --exercises-dir DIR Exercises directory (default: $EXERCISES_DIR)"
echo " --output-dir DIR Output directory (default: $OUTPUT_DIR)"
echo " --sleep SECONDS Sleep between runs in seconds (default: $SLEEP_BETWEEN)"
echo " --help Show this help message"
echo ""
echo "Example:"
echo " $0 --threads 2 --num-tests 5"
exit 0
;;
*)
echo "Unknown option: $1"
echo "Use --help for usage information"
exit 1
;;
esac
done

# Function to run a single benchmark
run_benchmark() {
local model="$1"
local run_name="$2"

echo "========================================================================"
echo "Starting benchmark: $run_name"
echo "Model: $model"
echo "Time: $(date)"
echo "========================================================================"

# Create the benchmark command
./benchmark/benchmark.py "$run_name" \
--new \
--model "$model" \
--edit-format "$EDIT_FORMAT" \
--map-tokens "$MAP_TOKENS" \
--threads "$THREADS" \
--hash-re "$HASH_RE" \
--num-tests "$NUM_TESTS" \
--languages "$LANGUAGES" \
--tries 2 \
--exercises-dir "$EXERCISES_DIR"

echo "Benchmark completed: $run_name"
echo "Results directory: $OUTPUT_DIR/$(ls -t $OUTPUT_DIR | grep "$run_name" | head -1)"
echo ""
}

# Function to generate statistics for all completed runs
generate_stats() {
echo "========================================================================"
echo "Generating statistics for all completed runs"
echo "========================================================================"

for dir in "$OUTPUT_DIR"/*; do
if [ -d "$dir" ] && [ -f "$dir/.cecli.results.json" ]; then
echo "Processing: $(basename "$dir")"
./benchmark/benchmark.py --stats "$dir" || true
echo ""
fi
done
}

# Main execution
main() {
echo "========================================================================"
echo "OpenRouter Model Benchmark Runner"
echo "========================================================================"
echo "Configuration:"
echo " Base name: $BASE_NAME"
echo " Edit format: $EDIT_FORMAT"
echo " Map tokens: $MAP_TOKENS"
echo " Threads: $THREADS"
echo " Hash regex: $HASH_RE"
echo " Num tests: $NUM_TESTS"
echo " Exercises dir: $EXERCISES_DIR"
echo " Output dir: $OUTPUT_DIR"
echo " Sleep between: ${SLEEP_BETWEEN}s"
echo " Models to test: ${#MODELS[@]}"
echo ""

# Create output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Run benchmarks for each model
for model in "${MODELS[@]}"; do
# Create a run name by replacing slashes with hyphens
local model_slug=$(echo "$model" | sed 's|/|-|g')
local run_name="${BASE_NAME}-${model_slug}"

run_benchmark "$model" "$run_name"

# Sleep between runs to avoid rate limiting
if [ "$SLEEP_BETWEEN" -gt 0 ]; then
echo "Sleeping for ${SLEEP_BETWEEN} seconds before next run..."
sleep "$SLEEP_BETWEEN"
echo ""
fi
done

# Generate statistics
generate_stats

echo "========================================================================"
echo "All benchmarks completed!"
echo "========================================================================"
echo ""
echo "Summary of results directories:"
ls -la "$OUTPUT_DIR" | grep "$BASE_NAME"
echo ""
echo "To view statistics for a specific run:"
echo " ./benchmark/benchmark.py --stats $OUTPUT_DIR/<run-directory>"
echo ""
echo "To compare all results:"
echo " for dir in $OUTPUT_DIR/*$BASE_NAME*; do"
echo " echo \"=== \$(basename \$dir) ===\""
echo " ./benchmark/benchmark.py --stats \"\$dir\" 2>/dev/null | grep -E '(pass_rate|total_cost|completed_tests)' || true"
echo " done"
}

# Run main function
main
Loading
Loading