feat(ci): add continuous batching to benchmarks (#41916)

McPatate · Copilot · web-flow · commit 069684ef87a8 · 2025-11-07T16:23:27.000Z
* feat(ci): add continuous batching to benchmarks * refactor(ci): PR comments * refactor(cb): when stopping, block by default * fix(benchmarks): `stream` -> `streaming` * fix(benchmarks): invalid configuration when cb has attn_impl == sdpa * tests(cb): fix attn impl * fix(benchmarks): update `get_throughput` formula * fix(benchmarks): prevent version conflicts and ensure proper cleanup in continuous batching (#42063) * Initial plan * fix(benchmarks): ensure proper cleanup and remove transformers from requirements - Remove transformers from benchmark_v2/requirements.txt to prevent version conflicts - Add try-finally block to ensure ContinuousBatchingManager.stop() is always called - This fixes TypeError about unexpected 'streaming' argument and prevents OOM from improper cleanup Co-authored-by: McPatate <9112841+McPatate@users.noreply.github.com> --------- Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com> Co-authored-by: McPatate <9112841+McPatate@users.noreply.github.com> * fix(benchmarks): raise the exception on failure instead of ignoring we catch the exception later on and raising it here helps debugging because it will be logged * test(cb): comment out failing tests for now added a `FIXME` mark * fix(benchmarks): revert `finally` removal but keep raising exception * test(cb): fix missing `require_read_token` import * refactor(benchmarks): error if no benchmarks were run * refactor(benchmarks): change default lvls of cb bench config --------- Co-authored-by: Copilot <198982749+Copilot@users.noreply.github.com> Co-authored-by: McPatate <9112841+McPatate@users.noreply.github.com>
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -32,16 +32,16 @@ jobs:
       options: --gpus all --privileged --ipc host
     steps:
       - name: Get repo
-        uses: actions/checkout@v4
+        uses: actions/checkout@v5
         with:
-          ref: ${{ github.event.pull_request.head.sha || github.sha }}
+          fetch-depth: 1
 
       - name: Install benchmark script dependencies
         run: python3 -m pip install -r benchmark_v2/requirements.txt kernels
 
       - name: Reinstall transformers in edit mode (remove the one installed during docker image build)
         working-directory: /transformers
-        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]" && python3 -m pip uninstall -y torchvision # temp fix
+        run: python3 -m pip uninstall -y transformers && python3 -m pip install -e ".[torch]"
 
       - name: Run benchmark
         run: |
diff --git a/benchmark/requirements.txt b/benchmark/requirements.txt
@@ -1,6 +1,5 @@
 gpustat==1.1.1
 psutil==6.0.0
 psycopg2==2.9.9
-torch>=2.4.0
 hf_xet
-pandas>=1.5.0
+pandas>=1.5.0
diff --git a/benchmark_v2/framework/benchmark_config.py b/benchmark_v2/framework/benchmark_config.py
@@ -36,6 +36,7 @@ def __init__(
         warmup_iterations: int = 5,
         measurement_iterations: int = 20,
         gpu_monitoring: bool = True,  # NOTE: you may want to disable this at times as we have obsvered it could heavily slow down benchmarks on AMD
+        continuous_batching: bool = False,
         batch_size: int = 1,
         sequence_length: int = 128,
         num_tokens_to_generate: int = 128,
@@ -51,6 +52,7 @@ def __init__(
         self.warmup_iterations = warmup_iterations
         self.measurement_iterations = measurement_iterations
         self.gpu_monitoring = gpu_monitoring
+        self.continuous_batching = continuous_batching
         # Input parameters
         self.batch_size = batch_size
         self.sequence_length = sequence_length
@@ -85,6 +87,22 @@ def check_validity(self, skip_validity_check: bool = False) -> None:
         if is_fa:
             logger.warning("Flash attention does not support compile mode. Turning off compile mode.")
             self.compile_mode = None
+        # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
+        if self.attn_implementation == "sdpa" and self.sdpa_backend is None:
+            default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
+            logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
+            self.sdpa_backend = default_backend
+        if self.continuous_batching:
+            if self.attn_implementation == "flex_attention":
+                logger.error(
+                    "disabling continuous batching because of invalid configuration: flex attention is not supported"
+                )
+                self.continuous_batching = False
+            elif self.attn_implementation == "sdpa" and self.sdpa_backend is not None:
+                logger.warning(
+                    "when continuous batching is enabled, sdpa_backend must be None because of the attention mask, setting it to None"
+                )
+                self.sdpa_backend = "math"
 
     @property
     def hash(self) -> str:
@@ -100,6 +118,7 @@ def infer_name(self, compact: bool = True) -> str:
             attn_code += f"_{self.sdpa_backend}" if self.attn_implementation == "sdpa" else ""
             compile_str = f"compiled_{self.compile_mode}" if self.compile_mode is not None else "uncompiled"
             kernelize_str = "kernelized" if self.kernelize else "unkernelized"
+            continuous_batching_str = "cb" if self.continuous_batching else "generate"
             sep = "-"
         else:
             iter_str = f"{self.warmup_iterations} warmup, {self.measurement_iterations} iterations"
@@ -109,15 +128,19 @@ def infer_name(self, compact: bool = True) -> str:
             attn_code += f" with {self.sdpa_backend} backend" if self.attn_implementation == "sdpa" else ""
             compile_str = "compiled" if self.compile_mode is not None else "not compiled"
             kernelize_str = "kernelized" if self.kernelize else "not kernelized"
+            continuous_batching_str = "continuous batching" if self.continuous_batching else "regular generate"
             sep = ", "
-        return sep.join([iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str])
+        return sep.join(
+            [iter_str, gpu_monitor_str, dimensions_str, attn_code, compile_str, kernelize_str, continuous_batching_str]
+        )
 
     def to_dict(self) -> dict[str, Any]:
         return {
             "name": self.name,
             "warmup_iterations": self.warmup_iterations,
             "measurement_iterations": self.measurement_iterations,
             "gpu_monitoring": self.gpu_monitoring,
+            "continuous_batching": self.continuous_batching,
             "batch_size": self.batch_size,
             "sequence_length": self.sequence_length,
             "num_tokens_to_generate": self.num_tokens_to_generate,
@@ -134,6 +157,7 @@ def from_dict(cls, data: dict[str, Any], skip_validity_check: bool = False) -> "
             warmup_iterations=data.get("warmup_iterations", 5),
             measurement_iterations=data.get("measurement_iterations", 20),
             gpu_monitoring=data.get("gpu_monitoring", False),
+            continuous_batching=data.get("continuous_batching", False),
             batch_size=data.get("batch_size", 1),
             sequence_length=data.get("sequence_length", 128),
             num_tokens_to_generate=data.get("num_tokens_to_generate", 128),
@@ -191,24 +215,28 @@ def get_config_by_level(level: int) -> list[BenchmarkConfig]:
             # Usually there is not much to gain by compiling with other modes, but we allow it for level 4
             compile_modes = BenchmarkConfig.all_compiled_modes if level >= 4 else [None, "default"]
             for cm in compile_modes:
-                for kernelize_on in [False, KERNELIZATION_AVAILABLE]:
-                    configs.append(
-                        BenchmarkConfig(
-                            attn_implementation=attn_implementation,
-                            sdpa_backend=sdpa_backend,
-                            compile_mode=cm,
-                            kernelize=kernelize_on,
+                for kernelize_on in {False, KERNELIZATION_AVAILABLE}:
+                    for cb_on in [False, True]:
+                        configs.append(
+                            BenchmarkConfig(
+                                attn_implementation=attn_implementation,
+                                sdpa_backend=sdpa_backend,
+                                compile_mode=cm,
+                                kernelize=kernelize_on,
+                                continuous_batching=cb_on,
+                            )
                         )
-                    )
         return configs
     # Otherwise, we add the configs for the given level
     if level >= 0:
         configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default"))
     if level >= 1:
         configs.append(BenchmarkConfig(attn_implementation="flash_attention_2"))
         configs.append(BenchmarkConfig(attn_implementation="eager", compile_mode="default"))
+        configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", continuous_batching=True))
     if level >= 2:
         configs.append(BenchmarkConfig(attn_implementation="sdpa", compile_mode="default"))
         configs.append(BenchmarkConfig(attn_implementation="flex_attention", compile_mode="default", kernelize=True))
         configs.append(BenchmarkConfig(attn_implementation="flash_attention_2", kernelize=True))
+        configs.append(BenchmarkConfig(attn_implementation="paged|sdpa", continuous_batching=True))
     return configs
diff --git a/benchmark_v2/framework/benchmark_runner.py b/benchmark_v2/framework/benchmark_runner.py
@@ -234,8 +234,9 @@ def run_benchmark(
             self.logger.info(f"Running benchmark scenario: {config.name}")
 
             # Quick validation: try one measurement first to see if this scenario works
+            generate_fn = self.time_generate_batch if config.continuous_batching else self.time_generate
             flush_memory()
-            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+            e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
                 max_new_tokens=1, gpu_monitor=None
             )
             if e2e_latency < 0:
@@ -245,14 +246,14 @@ def run_benchmark(
             # Warmup runs
             self.logger.info(f"Warming up with {config.warmup_iterations} iterations...")
             for _ in trange(config.warmup_iterations):
-                _ = self.time_generate(max_new_tokens=config.num_tokens_to_generate)
+                _ = generate_fn(max_new_tokens=config.num_tokens_to_generate)
             self.logger.info("Warmup over.")
 
             # Measurement runs
             result = BenchmarkResult()
             self.logger.info(f"Benchmarking with {config.measurement_iterations} iterations.")
             for _ in trange(config.measurement_iterations):
-                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = self.time_generate(
+                e2e_latency, token_generation_times, shape_and_decoded_output, gpu_metrics = generate_fn(
                     max_new_tokens=config.num_tokens_to_generate,
                     gpu_monitor=(GPUMonitor(logger=self.logger) if config.gpu_monitoring else None),
                 )
@@ -274,6 +275,58 @@ def run_benchmark(
                 "config": config,
             }
 
+    # TODO: refactor `generate_batch` to handle streaming so we can use it here
+    def time_generate_batch(
+        self,
+        max_new_tokens: int,
+        gpu_monitor: GPUMonitor | None = None,
+    ) -> tuple[float, list[float], str, GPURawMetrics | None]:
+        if gpu_monitor is not None:
+            gpu_monitor.start()
+        config = GenerationConfig(
+            max_new_tokens=max_new_tokens,
+            eos_token_id=self.tokenizer.eos_token_id,
+            pad_token_id=self.tokenizer.pad_token_id,
+            do_sample=True,
+        )
+        manager = self.model.init_continuous_batching(config)
+        manager.start()
+        try:
+            first_req_results = []
+            timestamps = []
+            wall_time_0 = time.perf_counter()
+            inputs = self.inputs["input_ids"].tolist()
+            manager.add_requests(inputs, max_new_tokens=max_new_tokens, streaming=True)
+            first_req_id = None
+            num_requests = len(inputs)
+            finished_requests = 0
+            while finished_requests < num_requests:
+                # NOTE: I don't like having the extra if stmt here, but hopefully won't degrade perf too much
+                result = manager.get_result()
+                if result:
+                    timestamps.append(time.perf_counter() - wall_time_0)
+                    if result.is_finished():
+                        finished_requests += 1
+                    if first_req_id is None:
+                        first_req_id = result.request_id
+                    if result.request_id == first_req_id:
+                        first_req_results.append(result)
+                else:
+                    if not manager.is_running():
+                        raise RuntimeError("Generation thread exited unexpectedly")
+            wall_time_1 = time.perf_counter()
+            gpu_metrics = gpu_monitor.stop_and_collect() if gpu_monitor is not None else None
+            decoded_output = self.tokenizer.decode(
+                [res.generated_tokens[0] for res in first_req_results], skip_special_tokens=True
+            )
+            shape_and_decoded_output = f"{(1, len(first_req_results))} | {decoded_output}"
+            e2e_latency = wall_time_1 - wall_time_0
+            return e2e_latency, timestamps, shape_and_decoded_output, gpu_metrics
+        except Exception as e:
+            raise e
+        finally:
+            manager.stop()
+
     def time_generate(
         self,
         max_new_tokens: int,
@@ -339,12 +392,6 @@ def run_benchmarks(
 
         n_configs = len(benchmark_configs)
         for i, config in enumerate(benchmark_configs):
-            # Handle SDPA backend if not determined by the config (needs to be done before skipping duplicates)
-            if config.attn_implementation == "sdpa" and config.sdpa_backend is None:
-                default_backend = "flash_attention"  # FIXME: torch has a _cur_sdpa_kernel_backends but it fails
-                self.logger.warning(f"No SDPA backend provided, using {default_backend} instead.")
-                config.sdpa_backend = default_backend
-
             # Skip if already run
             if config.hash in all_results:
                 self.logger.info(f"Skipping duplicate config {config.name} for model {model_id} ({i + 1}/{n_configs})")
@@ -368,21 +415,27 @@ def run_benchmarks(
             self.cleanup()
             self.save_results(model_id, all_results, timestamp=timestamp)
 
+        if len(all_results) < 1:
+            raise RuntimeError("No benchmark was run succesfully")
+
         if pretty_print_summary:
             print()
             print("=" * 100)
             print(f"Finished benchmarks in {time.perf_counter() - start_time:.2f} seconds")
             print(f"Total number of benchmarks: {len(all_results)}")
-            if len(all_results) > 0:
-                print("First run metadata:")
-                first_key = list(all_results.keys())[0]
-                first_metadata = all_results[first_key]["metadata"].to_dict()
-                hardware_info = first_metadata.pop("hardware_info")
-                pretty_print_dict(first_metadata | hardware_info, tabs=1)
+            print("First run metadata:")
+            first_key = list(all_results.keys())[0]
+            first_metadata = all_results[first_key]["metadata"].to_dict()
+            hardware_info = first_metadata.pop("hardware_info")
+            pretty_print_dict(first_metadata | hardware_info, tabs=1)
             for result in all_results.values():
                 print("=" * 100)
                 print(f"Config: {result['config'].infer_name(compact=False)}\n")
-                result["measurements"].pprint(batch_size=result["config"].batch_size, tabs=1)
+                result["measurements"].pprint(
+                    batch_size=result["config"].batch_size,
+                    num_generated_tokens=result["config"].num_tokens_to_generate,
+                    tabs=1,
+                )
             print("=" * 100)
 
         return (timestamp, all_results)
diff --git a/benchmark_v2/framework/data_classes.py b/benchmark_v2/framework/data_classes.py
@@ -36,16 +36,17 @@ def add_unit_to_duration(stats: dict[str, float]) -> dict[str, str]:
     return stats
 
 
-def equalize_lengths_and_collate(stats: list[dict[str, str]]) -> list[str]:
+def equalize_lengths_and_collate(stats: dict[str, dict[str, str]]) -> dict[str, str]:
+    """Note: This operation is destructive as it will update values in place before returning a new correctly formatted dict"""
     keys = ["avg", "std", "min", "med", "max", "p95"]
     for key in keys:
-        max_length = max(len(stat[key]) for stat in stats)
-        for stat in stats:
+        max_length = max(len(stat[key]) for stat in stats.values())
+        for stat in stats.values():
             stat[key] = stat[key].ljust(max_length, " ")
-    return [" ".join([f"{key}={stat[key]}" for key in keys]) for stat in stats]
+    return {name: " ".join([f"{key}={stat[key]}" for key in keys]) for name, stat in stats.items()}
 
 
-def pretty_print_dict(data: dict[str, Any], tabs: int = 0) -> None:
+def pretty_print_dict(data: dict[str, str], tabs: int = 0) -> None:
     max_key_length = max([len(key) for key in data.keys()])
     for key, value in data.items():
         tabs_str = "  " * tabs
@@ -141,27 +142,19 @@ def get_measured_ttft(self) -> list[float]:
     def get_measured_itl(self) -> list[float]:
         return [(dt[-1] - dt[0]) / (len(dt) - 1) for dt in self.token_generation_times if len(dt) > 1]
 
-    def get_throughput(self, batch_size: int) -> float:
-        return [
-            batch_size * len(dt) / e2e_latency
-            for e2e_latency, dt in zip(self.e2e_latency, self.token_generation_times)
-        ]
-
-    def pprint(self, batch_size: int = 0, tabs: int = 0) -> None:
-        stats_to_collate = [
-            add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
-            add_unit_to_duration(compute_basic_statistics(self.get_measured_itl())),
-        ]
-        if batch_size > 0:
-            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size))
-            stats_to_collate.append({key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()})
-        collated_stats = equalize_lengths_and_collate(stats_to_collate)
-        dict_to_pprint = {
-            "E2E Latency": collated_stats[0],
-            "Time to First Token": collated_stats[1],
-            "Inter-Token Latency": collated_stats[2],
+    def get_throughput(self, total_generated_tokens: int) -> list[float]:
+        return [total_generated_tokens / e2e_latency for e2e_latency in self.e2e_latency]
+
+    def pprint(self, batch_size: int = 0, num_generated_tokens: int = 0, tabs: int = 0) -> None:
+        measurements = {
+            "E2E Latency": add_unit_to_duration(compute_basic_statistics(self.e2e_latency)),
+            "Time to First Token": add_unit_to_duration(compute_basic_statistics(self.get_measured_ttft())),
         }
+        itl_values = self.get_measured_itl()
+        if len(itl_values) > 0:
+            measurements["Inter-Token Latency"] = add_unit_to_duration(compute_basic_statistics(itl_values))
         if batch_size > 0:
-            dict_to_pprint["Throughput"] = collated_stats[3]
+            throughput_stats = compute_basic_statistics(self.get_throughput(batch_size * num_generated_tokens))
+            measurements["Throughput"] = {key: f"{value:.2f}tok/s" for key, value in throughput_stats.items()}
+        dict_to_pprint = equalize_lengths_and_collate(measurements)
         pretty_print_dict(dict_to_pprint, tabs=tabs)
diff --git a/benchmark_v2/requirements.txt b/benchmark_v2/requirements.txt
@@ -2,6 +2,5 @@ numpy>=1.21.0
 psutil>=5.8.0
 gpustat>=1.0.0
 torch>=2.0.0
-transformers>=4.30.0
 datasets>=2.10.0
 huggingface_hub>=0.16.0
diff --git a/benchmark_v2/run_benchmarks.py b/benchmark_v2/run_benchmarks.py
@@ -80,6 +80,10 @@
     logger.info(f"Benchmark run UUID: {benchmark_run_uuid}")
     logger.info(f"Output directory: {args.output_dir}")
 
+    # We cannot compute ITL if we don't have at least two measurements
+    if any(n <= 1 for n in args.num_tokens_to_generate):
+        raise ValueError("--num_tokens_to_generate arguments should be larger than 1")
+
     # Error out if one of the arguments is not provided
     if len(args.batch_size) * len(args.sequence_length) * len(args.num_tokens_to_generate) == 0:
         raise ValueError(
diff --git a/src/transformers/generation/continuous_batching/continuous_api.py b/src/transformers/generation/continuous_batching/continuous_api.py
diff --git a/src/transformers/generation/continuous_batching/requests.py b/src/transformers/generation/continuous_batching/requests.py
diff --git a/tests/generation/test_continuous_batching.py b/tests/generation/test_continuous_batching.py