Changes from all commits
68 commits
fd58a24
Merge branch
Oligou Oct 14, 2025
80fb9cd
skip task if no documents
Oligou Oct 16, 2025
acd19f1
Change default use_chat_template when loading the tokenizer fails
Jeronymous Oct 23, 2025
3cc6315
Take HF_HOME env variable into account (if set)
Jeronymous Oct 23, 2025
f0f7162
Fix MGSM evals
Jeronymous Oct 28, 2025
df19f29
fix reshape bug
Jeronymous Oct 31, 2025
646d657
Remove padding from response
Jeronymous Oct 31, 2025
8c07847
add ruler metric and prompt
Oligou Nov 20, 2025
ed1718b
Add RULER in metrics
Oligou Nov 20, 2025
58d0ccf
make FLORES translation benchmark work with datasets v2 (parquet vers…
Jeronymous Dec 9, 2025
1deed74
Fix possible failure around stop_sequences
Jeronymous Dec 12, 2025
769a575
Fix failure reported in https://github.com/huggingface/lighteval/issu…
Jeronymous Dec 12, 2025
2d001dd
Do not use GPT as a judge
Jeronymous Dec 12, 2025
e7069e2
Fix IFBench subset
Jeronymous Dec 12, 2025
628d2b0
Fix IFEval-fr dataset repo
Jeronymous Dec 12, 2025
2d1f146
limit the model length to avoid error "ValueError: The model's max se…
Jeronymous Dec 15, 2025
b7cf5ff
make cache string independent of function random address
Jeronymous Dec 15, 2025
9436e15
Do not take version of transformers that is buggy w.r.t. OFFLINE behaviour
Jeronymous Dec 15, 2025
4c9e90c
Fix use of sets in eval code
Jeronymous Dec 15, 2025
bc164c1
Fix corner case
Jeronymous Dec 15, 2025
cb2da29
Misc fixes in RULER evaluation
Jeronymous Dec 16, 2025
82805ab
Change the code to make it work with more recent versions of vllm
Jeronymous Dec 18, 2025
41dec9a
Fix vllm call in LLM as a judge
Jeronymous Dec 18, 2025
2e968b2
Fix error in logprob computation with vllm >= 0.12, because of prefix…
Jeronymous Jan 6, 2026
d9af025
Fix GPQA-French benchmark (original dataset cannot be found anymore, …
Jeronymous Jan 20, 2026
a7e4591
Fix for Mistral tokenizer, that does not have eos_token attribute (bu…
Jeronymous Jan 20, 2026
45ba41e
Fix corner cases
Jeronymous Jan 20, 2026
9ba96b0
Fix corner case on IFBench
Jeronymous Feb 11, 2026
e74e9c0
override max_position_embedding with max_length passed by the user, t…
Jeronymous Feb 11, 2026
ddce778
add COMET and MetricX metrics to lighteval
Jeronymous Feb 17, 2026
b8532b6
Add COMET and MetricX to FLORES benchmarks
Jeronymous Feb 17, 2026
48ee2dc
Add new dependencies
Jeronymous Feb 17, 2026
9730191
COMET/MetricX : add options for device and batch size
Jeronymous Feb 17, 2026
cb1d040
Fix MetricX
Jeronymous Feb 17, 2026
be22ae1
Fix serialization of metric
Jeronymous Feb 17, 2026
121c6a2
Fix corner case
Jeronymous Feb 18, 2026
ee69f36
Merge pull request #1 from OpenLLM-France/comet
Jeronymous Feb 18, 2026
aafd3db
Fix mix of data and pipeline parallelism
Jeronymous Feb 19, 2026
e3fd675
Add support of context parallelism for versions of VLLM that support …
Jeronymous Feb 20, 2026
637d2ef
remove unnecessary deps (already there)
Jeronymous Feb 20, 2026
1167c70
Merge pull request #2 from OpenLLM-France/parallelism
Jeronymous Feb 20, 2026
33968ce
fix corner case
Jeronymous Mar 2, 2026
7ab7fa0
tune generation_size for math tasks
Jeronymous Mar 2, 2026
6a5c942
larger limit for gsm_plus
Jeronymous Mar 2, 2026
e8ac11b
add an option enable_thinking
Jeronymous Mar 4, 2026
0d59c8d
Add MathAlea benchmark for French math multiple-choice evaluation
Lduignan1 Feb 17, 2026
7859993
Fix gold index retrieval in prompt_mathalea function
Lduignan1 Feb 18, 2026
3354541
Update MathAlea metadata with detailed description, language, and tags
Lduignan1 Mar 6, 2026
e372a0f
Fix dataset reference in MathAlea metadata
Lduignan1 Mar 6, 2026
d42f5fd
Refactor MathAlea dataset configuration and prompt generation functions
Lduignan1 Mar 11, 2026
ce6848f
add system prompts in french and english
Lduignan1 Mar 23, 2026
1db696e
Make GPQA-fr a generative benchmark, not a MCQ
Jeronymous Apr 7, 2026
2d55527
Implement MMLU pro eval, with generative style (for instruct models)
Jeronymous Apr 8, 2026
91d9639
Merge pull request #3 from Lduignan1/mathalea
Jeronymous Apr 9, 2026
02757f7
Add Red Teaming benchmark based on AvgBench
Jeronymous Apr 9, 2026
7138a21
Allow to have non-numeric results (ex: judge textual output, for details
Jeronymous Apr 9, 2026
280f450
Make results deterministic. Add the judgement in the details
Jeronymous Apr 9, 2026
8d5c991
Also add another judgement where the judge does not see the question
Jeronymous Apr 9, 2026
da058f2
Add possibility to avoid running evaluation
Jeronymous Apr 22, 2026
481d9bd
Merge pull request #4 from OpenLLM-France/advbench
Jeronymous Apr 22, 2026
d1cf663
Merge upstream huggingface/lighteval main into merge_hf_main
Jeronymous Apr 22, 2026
180975c
Fix ruff style and lint after merge
Jeronymous Apr 22, 2026
2466d64
Solve version incompatibility in project install
Jeronymous Apr 22, 2026
68494ca
less differences with the upstream branch
Jeronymous Apr 22, 2026
9ca1f4b
Add copyright
Jeronymous Apr 22, 2026
6ee2a9e
less differences with the upstream branch
Jeronymous Apr 22, 2026
d9fe736
do not build doc on fork
Jeronymous Apr 22, 2026
379ed71
Add safety / red-teaming benchmarks
Jeronymous Apr 22, 2026
1 change: 1 addition & 0 deletions .github/workflows/doc-build.yml
100644 → 100755
@@ -9,6 +9,7 @@ on:

jobs:
build:
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/build_main_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.sha }}
1 change: 1 addition & 0 deletions .github/workflows/doc-pr-build.yml
100644 → 100755
@@ -9,6 +9,7 @@ concurrency:

jobs:
build:
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/build_pr_documentation.yml@90b4ee2c10b81b5c1a6367c4e6fc9e2fb510a7e3 # main
with:
commit_sha: ${{ github.event.pull_request.head.sha }}
1 change: 1 addition & 0 deletions .github/workflows/doc-pr-upload.yml
100644 → 100755
@@ -8,6 +8,7 @@ on:

jobs:
build:
if: github.repository == 'huggingface/lighteval'
uses: huggingface/doc-builder/.github/workflows/upload_pr_documentation.yml@9ad2de8582b56c017cb530c1165116d40433f1c6 # main
with:
package_name: lighteval
3 changes: 3 additions & 0 deletions pyproject.toml
@@ -125,6 +125,9 @@ multilingual = [
"pyvi", # for vietnamese tokenizer
]
math = ["latex2sympy2_extended==1.0.6"]
# Disabled: unbabel-comet pins numpy<2 (all versions through 2.2.7), which conflicts with the base numpy>=2 pin.
# To use the COMET metric, install unbabel-comet manually
# translation = ["unbabel-comet>=2.2.0"]
wandb = ["wandb"]
trackio = ["trackio"]

4 changes: 3 additions & 1 deletion src/lighteval/logging/info_loggers.py
@@ -343,7 +343,9 @@ def aggregate(self, task_dict: dict[str, LightevalTask], bootstrap_iters: int =
# The metric is in a subset which has already been computed and saved
continue

aggregation = task.aggregation()[metric_name]
aggregation = task.aggregation().get(metric_name)
if aggregation is None:
continue

try:
metric_result = aggregation(metric_values)
57 changes: 57 additions & 0 deletions src/lighteval/metrics/imports/metricx_model.py
@@ -0,0 +1,57 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""MetricX model wrapper using MT5ForConditionalGeneration from transformers.

Instead of vendoring the custom MT5ForRegression class (which has compatibility
issues with newer transformers versions), we load the weights into the standard
MT5ForConditionalGeneration model and extract the regression prediction
(logit at vocab position 250089, clamped to [0, 25]) in the same way MetricX does.
"""

import torch
from transformers import MT5ForConditionalGeneration


class MetricXModel:
"""Wrapper that loads a MetricX checkpoint and performs regression inference."""

def __init__(self, model_name: str, device: str = "cpu"):
self.model = MT5ForConditionalGeneration.from_pretrained(model_name)
self.model.to(device)
self.model.eval()
self.device = device

def predict(self, input_ids: torch.LongTensor, attention_mask: torch.LongTensor) -> torch.FloatTensor:
"""Run MetricX regression inference.

Args:
input_ids: Tokenized input (batch, seq_len), with EOS already removed.
attention_mask: Attention mask (batch, seq_len), with EOS already removed.

Returns:
Prediction scores (batch,), clamped to [0, 25]. Lower is better.
"""
batch_size = input_ids.size(0)
decoder_input_ids = torch.zeros(batch_size, 1, dtype=torch.long, device=self.device)

with torch.no_grad():
output = self.model(
input_ids=input_ids,
attention_mask=attention_mask,
decoder_input_ids=decoder_input_ids,
)

# 250089 = <extra_id_10>, the token MetricX uses for regression output
predictions = output.logits[:, 0, 250089]
return torch.clamp(predictions, 0, 25)
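For context, a minimal usage sketch of this wrapper. It mirrors how the MetricXMetric class in metrics_sample.py (further down in this diff) drives the model: the tokenizer name, the packed input format and the 1024-token truncation are taken from that class, and the example sentences are purely illustrative.

```python
# Illustrative sketch: drive MetricXModel the same way MetricXMetric does.
from transformers import AutoTokenizer

from lighteval.metrics.imports.metricx_model import MetricXModel

tokenizer = AutoTokenizer.from_pretrained("google/mt5-large")
model = MetricXModel("google/metricx-24-hybrid-large-v2p6", device="cpu")

# MetricX expects candidate, reference and source packed into a single string.
text = "candidate: Bonjour le monde reference: Bonjour, monde source: Hello, world"
inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=1024)

# Strip the EOS token appended by the tokenizer before running the regression head.
input_ids = inputs["input_ids"][:, :-1]
attention_mask = inputs["attention_mask"][:, :-1]

score = model.predict(input_ids, attention_mask).item()
print(score)  # in [0, 25], lower is better
```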
32 changes: 31 additions & 1 deletion src/lighteval/metrics/metrics.py
@@ -42,9 +42,11 @@
BLEURT,
MRR,
ROUGE,
RULER,
AccGoldLikelihood,
AvgAtN,
BertScore,
COMETMetric,
ExactMatches,
Extractiveness,
F1_score,
@@ -53,6 +55,7 @@
JudgeLLMSimpleQA,
LoglikelihoodAcc,
MajAtN,
MetricXMetric,
PassAtK,
Recall,
StringDistance,
@@ -207,7 +210,6 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)

bleurt = SampleLevelMetric(
metric_name="bleurt",
sample_level_fn=BLEURT(),
@@ -236,6 +238,13 @@ class Metrics(Enum):
corpus_level_fn=CorpusLevelTranslationMetric("chrf++"),
higher_is_better=True,
)
comet = SampleLevelMetric(
metric_name="comet",
sample_level_fn=COMETMetric(),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
higher_is_better=True,
)
copyright = SampleLevelMetricGrouping(
metric_name=["longest_common_prefix_length", "edit_distance", "edit_similarity"],
sample_level_fn=StringDistance(
@@ -445,6 +454,13 @@ class Metrics(Enum):
corpus_level_fn=MatthewsCorrCoef(),
higher_is_better=True,
)
metricx = SampleLevelMetric(
metric_name="metricx",
sample_level_fn=MetricXMetric(),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
higher_is_better=False,
)
mrr = SampleLevelMetric(
metric_name="mrr",
sample_level_fn=MRR(),
@@ -550,6 +566,20 @@ class Metrics(Enum):
corpus_level_fn=np.mean,
higher_is_better=True,
)
ruler_match_any = SampleLevelMetric(
metric_name="ruler_match",
sample_level_fn=RULER("any"),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
higher_is_better=True,
)
ruler_match_all = SampleLevelMetric(
metric_name="ruler_match",
sample_level_fn=RULER("all"),
category=SamplingMethod.GENERATIVE,
corpus_level_fn=np.mean,
higher_is_better=True,
)
simpleqa_judge = SampleLevelMetricGrouping(
metric_name=["simpleqa_judge"],
higher_is_better={"simpleqa_judge": True},
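The two new RULER entries only differ in how multiple gold answers are aggregated ("any" vs "all"). The sample-level logic, defined in metrics_sample.py below, reduces to a case-insensitive substring check over the golds; here is a standalone sketch of that rule (it does not call the lighteval classes, and the example strings are illustrative):

```python
def ruler_score(golds: list[str], prediction: str, aggregation: str = "any") -> float:
    # Mirrors RULER.compute: case-insensitive substring match of each gold in the prediction.
    hits = [1.0 if gold.lower() in prediction.lower() else 0.0 for gold in golds]
    if aggregation == "any":
        return max(hits)  # 1.0 as soon as one gold is retrieved
    return sum(hits) / len(hits)  # fraction of golds retrieved ("all")


# Two needles, only one retrieved by the model:
print(ruler_score(["alpha", "bravo"], "The passage mentions ALPHA only.", "any"))  # 1.0
print(ruler_score(["alpha", "bravo"], "The passage mentions ALPHA only.", "all"))  # 0.5
```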
152 changes: 151 additions & 1 deletion src/lighteval/metrics/metrics_sample.py
@@ -71,7 +71,7 @@ def __str__(self):
attr_strs = []
for k, v in attrs.items():
if callable(v):
val_str = v.__name__
val_str = getattr(v, "__name__", type(v).__name__)
else:
val_str = str(v)
attr_strs.append(f"{k}={val_str}")
@@ -762,6 +762,39 @@ def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> dict[str
return self.summac.score_one(inp, prediction)["score"]


class RULER(SampleLevelComputation):
def __init__(
self,
aggregation_method="any",
):
"""RULER exact match class.

Args:
aggregation_method (str, optional): Method to aggregate multiple golds. Can be 'any' or 'all'. Defaults to 'any'.
"""
if aggregation_method not in ["any", "all"]:
raise ValueError(f"aggregation_method must be one of 'any' or 'all'. Was {aggregation_method} instead.")
self.aggregation_method = aggregation_method

def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
"""Computes the metric over a list of golds and predictions for one single sample.

Args:
doc (Doc): The document containing gold references.
model_response (ModelResponse): The model's response containing predictions.
**kwargs: Additional keyword arguments.

Returns:
float: Aggregated score over the current sample's items.
"""
golds = doc.get_golds()
predictions = model_response.final_text
if self.aggregation_method == "any":
return max(1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds)
elif self.aggregation_method == "all":
return sum(1.0 if r.lower() in predictions[0].lower() else 0.0 for r in golds) / len(golds)


class BLEURT(SampleLevelComputation):
def __init__(self):
"""Creates a BLEURT scorer using a light bleurt-tiny-512 model.
@@ -1454,3 +1487,120 @@ def metric_names(self):

def num_samples(self):
return self.n if self.n is not None else self.k


class COMETMetric(SampleLevelComputation):
def __init__(
self,
model_name: str = "Unbabel/wmt22-comet-da",
source_column: str = "source",
batch_size: int = 8,
gpus: int = 0,
accelerator: str = "cpu",
):
"""COMET metric for machine translation evaluation.

Args:
model_name (str): Name of the COMET model to use.
source_column (str): Key in doc.specific containing the source text.
batch_size (int): Batch size for COMET model inference.
gpus (int): Number of GPUs to use (0 for CPU-only).
accelerator (str): Accelerator to use ("cpu" or "cuda"). MPS is not supported.
"""
if accelerator == "mps":
raise ValueError("MPS is not supported for COMET")

self.model_name = model_name
self.source_column = source_column
self.batch_size = batch_size
self.gpus = gpus
self.accelerator = accelerator
self._model = None

def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
"""Computes the COMET score for a single translation.

Args:
doc (Doc): The document containing gold references and source text in doc.specific.
model_response (ModelResponse): The model's response containing predictions.
**kwargs: Unused; kept for compatibility with the metric compute signature.

Returns:
float: COMET score scaled to 0-100 (higher is better).
"""
if self._model is None:
from comet import download_model, load_from_checkpoint

logger.info(f"Loading COMET model {self.model_name}...")
model_path = download_model(self.model_name)
self._model = load_from_checkpoint(model_path)

source = doc.specific[self.source_column]
prediction = model_response.final_text[0]
reference = doc.get_golds()[0]

data = [{"src": source, "mt": prediction, "ref": reference}]
output = self._model.predict(
data,
batch_size=self.batch_size,
gpus=self.gpus,
accelerator=self.accelerator,
)
return output.scores[0] * 100


class MetricXMetric(SampleLevelComputation):
def __init__(
self,
model_name: str = "google/metricx-24-hybrid-large-v2p6",
tokenizer_name: str = "google/mt5-large",
source_column: str = "source",
batch_size: int = 8,
device: str = "cpu",
):
"""MetricX metric for machine translation evaluation.

Args:
model_name (str): Name of the MetricX model to use.
tokenizer_name (str): Name of the tokenizer to use.
source_column (str): Key in doc.specific containing the source text.
batch_size (int): Batch size for tokenization.
device (str): Device to run inference on ("cpu", "cuda").
"""
self.model_name = model_name
self.tokenizer_name = tokenizer_name
self.source_column = source_column
self.batch_size = batch_size
self.device = device
self._model = None
self._tokenizer = None

def compute(self, doc: Doc, model_response: ModelResponse, **kwargs) -> float:
"""Computes the MetricX score for a single translation.

Args:
doc (Doc): The document containing gold references and source text in doc.specific.
model_response (ModelResponse): The model's response containing predictions.
**kwargs: Unused; kept for compatibility with the metric compute signature.

Returns:
float: MetricX score (lower is better, typically 0-25).
"""
if self._model is None:
from lighteval.metrics.imports.metricx_model import MetricXModel

logger.info(f"Loading MetricX model {self.model_name}...")
self._model = MetricXModel(self.model_name, device=self.device)
self._tokenizer = AutoTokenizer.from_pretrained(self.tokenizer_name)

source = doc.specific[self.source_column]
prediction = model_response.final_text[0]
reference = doc.get_golds()[0]

input_text = f"candidate: {prediction} reference: {reference} source: {source}"
inputs = self._tokenizer(input_text, return_tensors="pt", truncation=True, max_length=1024)
# MetricX requires removing the EOS token appended by the tokenizer
input_ids = inputs["input_ids"][:, :-1].to(self.device)
attention_mask = inputs["attention_mask"][:, :-1].to(self.device)

return self._model.predict(input_ids, attention_mask).item()
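For reference, a minimal sketch of the underlying unbabel-comet calls that COMETMetric wraps (the package must be installed manually, as noted in the pyproject.toml change above; the example sentences are illustrative):

```python
# Illustrative sketch of the comet API used by COMETMetric.
# Requires a manual `pip install unbabel-comet` (see the pyproject.toml note above).
from comet import download_model, load_from_checkpoint

model_path = download_model("Unbabel/wmt22-comet-da")
model = load_from_checkpoint(model_path)

data = [{
    "src": "Hello, world",      # source text (doc.specific["source"])
    "mt": "Bonjour le monde",   # model translation (model_response.final_text[0])
    "ref": "Bonjour, monde",    # gold reference (doc.get_golds()[0])
}]

output = model.predict(data, batch_size=8, gpus=0, accelerator="cpu")
print(output.scores[0] * 100)  # COMETMetric reports the 0-1 score scaled to 0-100
```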