 import torch._inductor.config
 import torch.distributed as dist
 import torch.nn as nn
-#from fms.models import get_model
 from vllm.model_executor.model_loader import get_model
 from transformers import PretrainedConfig
 from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig, VllmConfig
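The thrust of this commit: every constructor now takes the aggregated VllmConfig instead of separate ModelConfig / ParallelConfig / SchedulerConfig arguments, and the sub-configs are read off it as attributes. A minimal sketch of the resulting access pattern, assuming an already-built vllm_config instance (all three attribute chains appear verbatim in the hunks below):

# Illustrative only, not part of the patch. `vllm_config` is assumed to be a
# fully constructed vllm.config.VllmConfig; the sub-configs hang off it.
vocab_size = vllm_config.model_config.hf_config.vocab_size    # model details
world_size = vllm_config.parallel_config.world_size           # TP ranks
max_num_seqs = vllm_config.scheduler_config.max_num_seqs      # batch bound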
@@ -40,17 +39,14 @@ class SpyreCausalLM(nn.Module):

     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
         vllm_config: VllmConfig,
         max_prompt_length: int,
         max_decode_length: int,
     ) -> None:
         super().__init__()

         self.logits_processor = LogitsProcessor(
-            model_config.hf_config.vocab_size, logits_as_input=True)
+            vllm_config.model_config.hf_config.vocab_size, logits_as_input=True)
         self.sampler = get_sampler()

         # boolean tensor of length batch size with indices:
@@ -63,15 +59,9 @@ def __init__(

         # FMS Model
         if envs_spyre.VLLM_SPYRE_USE_CB:
-            self.model = ContinuousBatchingFmsModel(model_config,
-                                                    parallel_config,
-                                                    scheduler_config,
-                                                    vllm_config)
+            self.model = ContinuousBatchingFmsModel(vllm_config)
         else:
             self.model = StaticBatchingFmsModel(
-                model_config,
-                parallel_config,
-                scheduler_config,
                 vllm_config,
                 max_prompt_length,
                 max_decode_length,
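For context, a hypothetical call site under the new SpyreCausalLM signature (the length values are made up for illustration):

# Hypothetical usage, assuming `vllm_config` was built by the engine; only the
# aggregated config plus the two Spyre length bounds are passed now.
model = SpyreCausalLM(
    vllm_config,
    max_prompt_length=64,   # illustrative value
    max_decode_length=20,   # illustrative value
)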
@@ -81,8 +71,6 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        #masks: torch.Tensor,
-        #intermediate_tensors: Optional[IntermediateTensors],
         is_prompt: bool,
         current_tkv_mask: Optional[torch.Tensor] = None,
         left_padded_prompt_mask: Optional[torch.Tensor] = None,
@@ -109,7 +97,6 @@ def forward(
         logits = self.model(
             input_ids,
             positions=positions,
-            #only_last_token=not envs_spyre.VLLM_SPYRE_USE_CB,
             **extra_kwargs,
         )

@@ -147,16 +134,14 @@ class FmsModelBase(nn.Module):

     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
         vllm_config: VllmConfig,
         max_prompt_length: int,
         max_decode_length: int,
         sendnn_dynamic: bool,
     ) -> None:
         super().__init__()

-        self.config: PretrainedConfig = model_config.hf_config
+        self.config: PretrainedConfig = vllm_config.model_config.hf_config
         self.dtype = torch.float16 if envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND == \
             'sendnn' else torch.float32

@@ -165,11 +150,11 @@ def __init__(
         self.vllm_config = vllm_config

         # Load the weights from the cached or downloaded files.
-        self.load_weights(model_config=model_config,
+        self.load_weights(model_config=vllm_config.model_config,
                           max_prompt_length=max_prompt_length,
                           max_decode_length=max_decode_length,
                           distributed_strategy="tp"
-                          if parallel_config.world_size > 1 else None,
+                          if vllm_config.parallel_config.world_size > 1 else None,
                           sendnn_dynamic=sendnn_dynamic)


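The strategy selection above, which the diff splits across a removed/added line pair, collapses to a simple rule; a sketch with a hypothetical rank count:

# Illustrative restatement: tensor parallelism ("tp") is only requested when
# more than one rank is configured; single-rank runs pass None.
world_size = vllm_config.parallel_config.world_size   # e.g. 4 (hypothetical)
distributed_strategy = "tp" if world_size > 1 else None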
@@ -224,7 +209,6 @@ def load_weights(
         # we can use fused weights unless running on Spyre
         fused_weights = envs_spyre.VLLM_SPYRE_DYNAMO_BACKEND != "sendnn"

-        #self.model = get_model(architecture="hf_configured", variant=model_config.model, model_path=model_path, source=model_source, data_type=self.dtype, distributed_strategy=distributed_strategy, group=dist.group.WORLD, fused_weights=fused_weights, linear_config=linear_config)
         self.model = get_model(vllm_config=self.vllm_config)

         self.model.eval()
@@ -273,30 +257,26 @@ class ContinuousBatchingFmsModel(FmsModelBase):

     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        scheduler_config: SchedulerConfig,
         vllm_config: VllmConfig,
     ) -> None:

         BLOCK_SIZE = 64
-        max_batch = scheduler_config.max_num_seqs
-        max_model_len = scheduler_config.max_model_len
+        max_batch = vllm_config.scheduler_config.max_num_seqs
+        max_model_len = vllm_config.scheduler_config.max_model_len

         # edge case: prompt fills model length: can produce 1 token with prefill
         max_prompt_length = max_model_len
         # edge case: prompt will be padded to first block:
         # can produce 1 token with prefill plus rest of model length
         max_decode_length = max_model_len - BLOCK_SIZE + 1

-        super().__init__(model_config,
-                         parallel_config,
+        super().__init__(vllm_config,
                          max_prompt_length,
                          max_decode_length,
                          sendnn_dynamic=True)

         # physical KV cache on AIU Spyre: will eventually not live in this class
-        num_kv_heads = model_config.get_num_kv_heads(parallel_config)
+        num_kv_heads = vllm_config.model_config.get_num_kv_heads(vllm_config.parallel_config)

         if self.config.model_type in {'llama', 'granite'}:
             num_layers = self.config.num_hidden_layers
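The two bounds above encode the edge cases named in the comments; a worked example with a hypothetical model length:

# Worked example (max_model_len = 2048 is hypothetical). A prompt may occupy
# the full context and still yield one token from prefill; a prompt padded up
# to a single 64-token block leaves the rest of the context for decode.
BLOCK_SIZE = 64
max_model_len = 2048
max_prompt_length = max_model_len                    # 2048
max_decode_length = max_model_len - BLOCK_SIZE + 1   # 1985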
@@ -330,9 +310,7 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        mask: torch.Tensor,
         use_cache: bool,
-        only_last_token: bool,
         current_tkv_mask: torch.Tensor,
         left_padded_prompt_mask: torch.Tensor,
         block_table: torch.Tensor,
@@ -343,36 +321,29 @@ def forward(
         output = self.model(
             input_ids,
             positions=positions,
-            mask=mask,
             past_key_value_states=self.past_key_value_states,
             use_cache=use_cache,
-            only_last_token=only_last_token,
             current_tkv_mask=current_tkv_mask,
             left_padded_prompt_mask=left_padded_prompt_mask,
             block_table=block_table,
             slot_mapping=slot_mapping,
             **extra_kwargs,
         )

-        logits, self.past_key_value_states = output
+        self.past_key_value_states = output

-        return logits
+        return output


 class StaticBatchingFmsModel(FmsModelBase):

     def __init__(
         self,
-        model_config: ModelConfig,
-        parallel_config: ParallelConfig,
-        _: SchedulerConfig,
         vllm_config: VllmConfig,
         max_prompt_length: int,
         max_decode_length: int,
     ) -> None:
-        super().__init__(model_config,
-                         parallel_config,
-                         vllm_config,
+        super().__init__(vllm_config,
                          max_prompt_length,
                          max_decode_length,
                          sendnn_dynamic=False)
@@ -385,20 +356,16 @@ def forward(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
-        #mask: torch.Tensor,
         **extra_kwargs,
     ) -> torch.Tensor:

         output = self.model(
             input_ids,
             positions=positions,
-            #mask=mask,
             intermediate_tensors=self.past_key_value_states,
             **extra_kwargs,
         )

-        #logits, self.past_key_value_states = output
         self.past_key_value_states = output
-        #logits = self.model.compute_logits(output)

         return output
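Both forwards now follow the same contract: the raw model output is cached as past_key_value_states and returned directly, instead of unpacking a (logits, kv_states) tuple. A sketch of the caller-side view, assuming the model returned by vLLM's get_model() yields a single output object per call:

# Sketch, not part of the patch; `fms_model` is a hypothetical
# StaticBatchingFmsModel instance. Its forward caches the raw output
# internally and returns the same object, which SpyreCausalLM.forward
# then binds as `logits` and hands to the logits processor.
logits = fms_model(input_ids, positions=positions)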