77 commits
c2b83e2
Split up enhancement and features in release notes template (#984)
NathanHB Oct 14, 2025
090630e
Fixing mixeval (#1006)
clefourrier Oct 14, 2025
3af8925
Fix nltk import failing (#1013)
clefourrier Oct 14, 2025
70acb85
Fix 999: always provide parameters in the metric name to allow using …
clefourrier Oct 14, 2025
e7d885c
added fallback for incomplete configs for vlm models launched as llms…
clefourrier Oct 14, 2025
161d47c
Fixing naming for sample evals + adding reqs in aime24 (#989)
clefourrier Oct 14, 2025
bf8b547
add translation literals indic (#1015)
rpm000 Oct 21, 2025
3cd31fd
Move tasks to individual files (#1016)
NathanHB Oct 29, 2025
880bebe
Adds inspectai (#1022)
NathanHB Nov 3, 2025
fa4860f
adds mmlu-pro (#1031)
NathanHB Nov 4, 2025
17e024b
Fix inspect reasoning effort (#1033)
NathanHB Nov 4, 2025
97303ac
Update huggingface-cli login to use newer hf auth login (#1034)
xeophon Nov 4, 2025
5aa09c5
add openai and inspect ai lower bound (#1035)
NathanHB Nov 4, 2025
b5cbd91
fix `lighteval task inspect` command and tiny bench task (#992)
NathanHB Nov 5, 2025
5b7ca62
run all hf providers with `:all` (#1039)
NathanHB Nov 5, 2025
31433cc
remove suites and make fewshot optional (#1038)
NathanHB Nov 5, 2025
566a7be
put lower bound on typer to use literal type (#1042)
NathanHB Nov 6, 2025
d04e4f9
remove suites from serbian_eval.py (#1044)
S-Y-A-N Nov 12, 2025
cd91dde
neater bundle and logdir (#1043)
NathanHB Nov 12, 2025
2247df7
not forcing use_logits at True (#1050)
f14-bertolotti Nov 12, 2025
6524c6a
wrong attribute self.k -> self.n (#1049)
f14-bertolotti Nov 12, 2025
35babcb
Remove suites in task configs example and fix task with hf_filters (#…
NathanHB Nov 12, 2025
b8ccd20
add a task dump in registry for better documentation of tasks (#1052)
NathanHB Nov 12, 2025
cb97d5c
Fix set using wrong syntax (#1057)
f14-bertolotti Nov 13, 2025
391d5b4
Fix: correct argument order in MajAtN.compute (#1058)
WeiKangda Nov 14, 2025
af6b5b4
Update LiteLLM configuration for hosted_vllm provider (#1060)
abhiram1809 Nov 14, 2025
ad58fed
use correct hf subset for ifbench multiturn (#1061)
sam-paech-liquid Nov 17, 2025
babeec9
One file one task definition (#1059)
NathanHB Nov 17, 2025
d9ea404
adding starred tag for frontend
NathanHB Nov 18, 2025
5425c33
Adding AA Omniscience task (#1066)
NathanHB Nov 18, 2025
99162f1
Fix task config metric typing to accept Metric enums (#1018)
emmanuel-ferdman Nov 20, 2025
2236e17
removed duplicate code, useless function, added stronger deletion of …
clefourrier Nov 20, 2025
1496355
[FT] Add `py.typed` to `lighteval` (#1071)
akshathmangudi Nov 20, 2025
7f50228
Add style bot and other quality of life (#1076)
NathanHB Nov 21, 2025
d59ce25
Add style bot (#1077)
NathanHB Nov 21, 2025
a64541c
Add style bot (#1078)
NathanHB Nov 21, 2025
66ce47e
Add style bot (#1079)
NathanHB Nov 21, 2025
943c4c3
batched metric was not aggregated properly (#1067)
f14-bertolotti Nov 24, 2025
9009723
add to inspect (#1065)
NathanHB Nov 24, 2025
5803818
bumping version
NathanHB Nov 24, 2025
99ef5b9
Fix the quickstart description? (#1091)
julien-c Nov 28, 2025
98ac1bf
Add starred attribute to gpqa.py metadata
NathanHB Dec 4, 2025
48dcd83
Update available-tasks.mdx (#1088)
bram-pramono Dec 4, 2025
6889901
[DOC] Fix dev dependencies install command (#1085)
jgyasu Dec 4, 2025
557b8d5
update hf_revision hash for multilingual hellaswag (#1084)
rpm000 Dec 4, 2025
2c98b54
feat: Add Kyrgyz LLM Bench multilingual tasks (#1070)
golden-ratio Dec 8, 2025
c6e2ce7
aime_avg was not added to TASKS_TABLE (#1098)
francesco-bertolotti Dec 8, 2025
6d1f147
Comment out PR Style Bot workflow configuration
paulinebm Dec 9, 2025
a5d13a4
Refactor PR Style Bot workflow configuration
paulinebm Dec 10, 2025
22aa98c
Add TEST secret environment variable
paulinebm Dec 10, 2025
0a8b90a
Comment out PR Style Bot workflow configuration
paulinebm Dec 10, 2025
8238d3e
Refactor PR Style Bot workflow configuration
paulinebm Dec 11, 2025
f48af0b
Comment out PR Style Bot workflow configuration
paulinebm Dec 11, 2025
7fba130
Update comment bot secrets in workflow (#1107)
paulinebm Dec 15, 2025
6496d62
Refactor PR Style Bot workflow with new inputs (#1105)
NathanHB Dec 15, 2025
03d8c4e
Enable loading data sets from files for custom tasks (#1083)
davebiagioni Jan 6, 2026
36b3e6c
refactor: adding api_key param to litellm (#1114)
pjavanrood Jan 8, 2026
d9a9401
multi challenge (#1120)
NathanHB Jan 13, 2026
845c989
refactor: add formatted response to litellm (#1116)
pjavanrood Jan 13, 2026
e7048c3
Mathvista (#1118)
NathanHB Jan 13, 2026
61c547b
long horizon execution (#1119)
NathanHB Jan 14, 2026
f888858
bbeh (#1124)
NathanHB Jan 14, 2026
6ce93d5
Fix typo in few_shots_select option error message. ('fbalanced' -> 'b…
jayminban Jan 14, 2026
06aee5b
add eval results tip (#1126)
burtenshaw Jan 21, 2026
0a74a17
Upgrade vLLM from 0.10.1.1 to 0.14.1 (#1173)
NathanHB Mar 4, 2026
33acf35
Add test on main branch of vllm (#1175)
NathanHB Mar 4, 2026
e274b37
🔒 Pin GitHub Actions to commit SHAs (#1201)
paulinebm Apr 7, 2026
34889df
Add support for vllm >= 0.19.0 (#1211)
lewtun Apr 13, 2026
10b9104
chore: bump doc-builder SHA for PR upload workflow (#1213)
rtrompier Apr 15, 2026
d1cf663
Merge upstream huggingface/lighteval main into merge_hf_main
Jeronymous Apr 22, 2026
180975c
Fix ruff style and lint after merge
Jeronymous Apr 22, 2026
2466d64
Solve version incompatibility in project install
Jeronymous Apr 22, 2026
68494ca
less differences with the upstream branch
Jeronymous Apr 22, 2026
9ca1f4b
Add copyright
Jeronymous Apr 22, 2026
6ee2a9e
less differences with the upstream branch
Jeronymous Apr 22, 2026
d9fe736
do not build doc on fork
Jeronymous Apr 22, 2026
379ed71
Add safety / red-teaming benchmarks
Jeronymous Apr 22, 2026
5 changes: 4 additions & 1 deletion .github/release.yml
@@ -5,7 +5,10 @@ changelog:
categories:
- title: New Features 🎉
labels:
- feature/enhancement
- feature
- title: Enhancement ⚙️
labels:
- enhancement
- title: Documentation 📚
labels:
- documentation
3 changes: 2 additions & 1 deletion .github/workflows/doc-build.yml
100644 → 100755
@@ -9,7 +9,8 @@ on:

jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@main
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.sha }}
package: lighteval
3 changes: 2 additions & 1 deletion .github/workflows/doc-pr-build.yml
100644 → 100755
@@ -9,7 +9,8 @@ concurrency:

jobs:
build:
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@main
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
pr_number: ${{ github.event.number }}
6 changes: 4 additions & 2 deletions .github/workflows/doc-pr-upload.yml
100644 → 100755
@@ -8,9 +8,11 @@ on:

jobs:
build:
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@main
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6 # main
with:
package_name: lighteval
secrets:
hf_token: ${{ secrets.HF_DOC_BUILD_PUSH }}
comment_bot_token: ${{ secrets.COMMENT_BOT_TOKEN }}
comment_bot_app_id: ${{ secrets.COMMENT_BOT_APP_ID }}
comment_bot_secret_pem: ${{ secrets.COMMENT_BOT_SECRET_PEM }}
16 changes: 16 additions & 0 deletions .github/workflows/pr_style_bot.yaml
@@ -0,0 +1,16 @@
name: PR Style Bot

on:
issue_comment:
types: [created]

permissions:
pull-requests: write

jobs:
style:
uses: huggingface/huggingface_hub/.github/workflows/style-bot-action.yml@e000c1c89c65aee188041723456ac3a479416d4c # main
with:
python_quality_dependencies: "[quality]"
secrets:
bot_token: ${{ secrets.HF_STYLE_BOT_ACTION }}
4 changes: 2 additions & 2 deletions .github/workflows/quality.yaml
@@ -16,9 +16,9 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v2
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
- name: Setup Python environment
uses: actions/setup-python@v2
uses: actions/setup-python@e9aba2c848f5ebd159c070c61ea2c4e2b122355e # v2
with:
python-version: '3.10'
- name: Install dependencies
44 changes: 40 additions & 4 deletions .github/workflows/slow_tests.yaml
@@ -25,21 +25,57 @@ jobs:
fi

- name: Checkout repository
uses: actions/checkout@v4
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
lfs: true

- name: Install uv
uses: astral-sh/setup-uv@v5
uses: astral-sh/setup-uv@d4b2f3b6ecc6e67c4457f6d3e41ec42d3d0fcb86 # v5
with:
enable-cache: true

- name: Install the project
run: uv sync --extra dev
run: uv sync --extra dev-gpu

- name: Install Python development headers
run: sudo apt-get update && sudo apt-get install -y python3.12-dev

- name: Cache CUDA Toolkit
id: cache-cuda
uses: actions/cache@0057852bfaa89a56745cba8c7296529d2fc39830 # v4
with:
path: /usr/local/cuda-12.8
key: cuda-toolkit-12-8-${{ runner.os }}

- name: Install CUDA Toolkit
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
# Add NVIDIA package repositories
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
# Install CUDA toolkit 12.8 to match nvidia-cuda-runtime-cu12==12.8.90
sudo apt-get install -y cuda-toolkit-12-8

- name: Verify CUDA installation
run: |
ls -la /usr/local/cuda-12.8/bin/nvcc || echo "WARNING: nvcc not found at /usr/local/cuda-12.8/bin/nvcc"
if [ -f /usr/local/cuda-12.8/bin/nvcc ]; then
/usr/local/cuda-12.8/bin/nvcc --version
fi

- name: Setup CUDA environment
run: |
export CUDA_HOME=/usr/local/cuda-12.8
export PATH="/usr/local/cuda-12.8/bin:$PATH"
echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH

- name: run nvidia-smi
run: nvidia-smi

- name: Run tests
run: uv run pytest --disable-pytest-warnings --runslow tests/slow_tests/
run: |
export CUDA_HOME=/usr/local/cuda-12.8
export PATH="/usr/local/cuda-12.8/bin:$PATH"
uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
@@ -46,7 +46,7 @@ jobs:
enable-cache: true

- name: Install the project
run: uv sync --extra dev
run: uv sync --extra dev-gpu

- name: Ensure cache directories exist
run: mkdir -p cache/models cache/datasets
4 changes: 2 additions & 2 deletions .github/workflows/trufflehog.yml
@@ -11,10 +11,10 @@ jobs:
runs-on: ubuntu-latest
steps:
- name: Checkout code
uses: actions/checkout@v4
uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
with:
fetch-depth: 0
- name: Secret Scanning
uses: trufflesecurity/trufflehog@main
uses: trufflesecurity/trufflehog@6bd2d14f7a4bc1e569fa3550efa7ec632a4fa67b # main
with:
extra_args: --only-verified
79 changes: 79 additions & 0 deletions .github/workflows/vllm_main_tests.yaml
@@ -0,0 +1,79 @@
name: vLLM Main Branch Tests

on:
schedule:
- cron: '0 2 * * 1' # Every Monday at 2 AM UTC
workflow_dispatch:

permissions:
contents: read

jobs:
test_vllm_main:
name: Test with vLLM main branch
runs-on: 'aws-g4dn-2xlarge-use1-public-80'
continue-on-error: true

steps:
- name: Install Git LFS
run: |
if ! command -v git-lfs &> /dev/null; then
sudo apt-get update && sudo apt-get install -y git-lfs
git lfs install
fi

- name: Checkout repository
uses: actions/checkout@v4
with:
lfs: true

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Install the project
run: uv sync --extra dev-gpu

- name: Install Python development headers
run: sudo apt-get update && sudo apt-get install -y python3.12-dev

- name: Cache CUDA Toolkit
id: cache-cuda
uses: actions/cache@v4
with:
path: /usr/local/cuda-12.8
key: cuda-toolkit-12-8-${{ runner.os }}

- name: Install CUDA Toolkit
if: steps.cache-cuda.outputs.cache-hit != 'true'
run: |
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
sudo dpkg -i cuda-keyring_1.1-1_all.deb
sudo apt-get update
sudo apt-get install -y cuda-toolkit-12-8

- name: Setup CUDA environment
run: |
echo "CUDA_HOME=/usr/local/cuda-12.8" >> $GITHUB_ENV
echo "/usr/local/cuda-12.8/bin" >> $GITHUB_PATH

- name: Verify CUDA
run: |
nvidia-smi
nvcc --version

- name: Install vLLM from main branch
run: |
uv pip uninstall -y vllm || true
uv pip install git+https://github.com/vllm-project/vllm.git@main

- name: Get vLLM version
id: vllm-info
run: |
VERSION=$(uv run python -c "import vllm; print(vllm.__version__)")
echo "version=$VERSION" >> $GITHUB_OUTPUT
echo "Testing vLLM version: $VERSION"

- name: Run tests
run: uv run pytest --disable-pytest-warnings --runslow -v -s tests/slow_tests/test_vllm_model.py
29 changes: 20 additions & 9 deletions README.md
@@ -25,6 +25,9 @@
<a href="https://huggingface.co/docs/lighteval/main/en/index" target="_blank">
<img alt="Documentation" src="https://img.shields.io/badge/Documentation-4F4F4F?style=for-the-badge&logo=readthedocs&logoColor=white" />
</a>
<a href="https://huggingface.co/spaces/OpenEvals/open_benchmark_index" target="_blank">
<img alt="Open Benchmark Index" src="https://img.shields.io/badge/Open%20Benchmark%20Index-4F4F4F?style=for-the-badge&logo=huggingface&logoColor=white" />
</a>
</p>

---
@@ -39,7 +42,10 @@ sample-by-sample results* to debug and see how your models stack up.

## Available Tasks

Lighteval supports **7,000+ evaluation tasks** across multiple domains and languages. Here's an overview of some *popular benchmarks*:
Lighteval supports **1000+ evaluation tasks** across multiple domains and
languages. Use [this
space](https://huggingface.co/spaces/OpenEvals/open_benchmark_index) to find what
you need, or read on for an overview of some *popular benchmarks*:


### 📚 **Knowledge**
@@ -62,7 +68,7 @@ Lighteval supports **7,000+ evaluation tasks** across multiple domains and langu

### 🌍 **Multilingual Evaluation**
- **Cross-lingual**: XTREME, Flores200 (200 languages), XCOPA, XQuAD
- **Language-specific**:
- **Language-specific**:
- **Arabic**: ArabicMMLU
- **Filipino**: FilBench
- **French**: IFEval-fr, GPQA-fr, BAC-fr
@@ -71,6 +77,7 @@ Lighteval supports **7,000+ evaluation tasks** across multiple domains and langu
- **Turkic**: TUMLU (9 Turkic languages)
- **Chinese**: CMMLU, CEval, AGIEval
- **Russian**: RUMMLU, Russian SQuAD
- **Kyrgyz**: Kyrgyz LLM Benchmark
- **And many more...**

### 🧠 **Core Language Understanding**
Expand All @@ -94,13 +101,14 @@ If you want to push results to the **Hugging Face Hub**, add your access token a
an environment variable:

```shell
huggingface-cli login
hf auth login
```
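For non-interactive environments such as CI, the token can also be supplied via the `HF_TOKEN` environment variable read by `huggingface_hub` — a minimal sketch, with a placeholder value:

```shell
# Non-interactive alternative to `hf auth login`:
# export the token directly (the value below is a placeholder).
export HF_TOKEN="hf_xxxxxxxxxxxxxxxxxxxx"
```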

## 🚀 Quickstart

Lighteval offers the following entry points for model evaluation:

- `lighteval eval`: Evaluate models using [inspect-ai](https://inspect.aisi.org.uk/) as a backend (preferred).
- `lighteval accelerate`: Evaluate models on CPU or one or more GPUs using [🤗
Accelerate](https://github.com/huggingface/accelerate)
- `lighteval nanotron`: Evaluate models in distributed settings using [⚡️
@@ -117,12 +125,10 @@ Lighteval offers the following entry points for model evaluation:
Did not find what you need? You can always create a custom model API by following [this guide](https://huggingface.co/docs/lighteval/main/en/evaluating-a-custom-model)
- `lighteval custom`: Evaluate custom models (can be anything)

Here's a **quick command** to evaluate using the *Accelerate backend*:
Here's a **quick command** to evaluate using a remote inference service:

```shell
lighteval accelerate \
"model_name=gpt2" \
"leaderboard|truthfulqa:mc|0"
lighteval eval "hf-inference-providers/openai/gpt-oss-20b" gpqa:diamond
```

Or use the **Python API** to run a model *already loaded in memory*!
Expand All @@ -136,7 +142,7 @@ from lighteval.pipeline import ParallelismManager, Pipeline, PipelineParameters


MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
BENCHMARKS = "lighteval|gsm8k|0"
BENCHMARKS = "gsm8k"

evaluation_tracker = EvaluationTracker(output_dir="./results")
pipeline_params = PipelineParameters(
@@ -181,7 +187,12 @@ If you're adding a **new feature**, please *open an issue first*.
If you open a PR, don't forget to **run the styling**!

```bash
pip install -e .[dev]
# For basic development (code quality, tests)
pip install -e ".[dev]"

# Or for GPU/vllm development and slow tests
pip install -e ".[dev-gpu]"

pre-commit install
pre-commit run --all-files
```