From 2c95f78a9fc930eaaefd4eef43f56b43bbc8d308 Mon Sep 17 00:00:00 2001 From: ethan Date: Fri, 12 Sep 2025 10:05:06 -0700 Subject: [PATCH 01/44] add qwen3_vl --- optimum/exporters/openvino/model_configs.py | 302 +++++++ optimum/exporters/openvino/model_patcher.py | 76 ++ .../openvino/modeling_visual_language.py | 735 +++++++++++++++++- 3 files changed, 1078 insertions(+), 35 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 9c1684db81..ed63baca1c 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -138,6 +138,8 @@ Qwen2MoEPatcher, Qwen2VLLanguageModelPatcher, Qwen2VLVisionEmbMergerPatcher, + Qwen3VLVisionEmbMergerPatcher, + Qwen3VLLanguageModelPatcher, Qwen3MoeModelPatcher, QwenModelPatcher, SanaTextEncoderModelPatcher, @@ -164,6 +166,10 @@ def init_model_configs(): "transformers", "AutoModelForImageTextToText", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", @@ -333,6 +339,57 @@ def patch_model_for_export( ) -> "ModelPatcher": return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs) +class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "input_ids", + "attention_mask", + "encoder_attention_mask", + "token_type_ids", + "position_ids", + "visual_pos_masks", + "deepstack_visual_embeds", + ) + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32", bool_dtype: str = "bool"): + if input_name == "deepstack_visual_embeds": + return self.random_float_tensor([3, 32, 2560], framework=framework, dtype=float_dtype) + if input_name == "visual_pos_masks": + return self.constant_tensor( + shape=[self.batch_size, 16], + framework=framework, + value=1, + dtype=DTYPE_MAPPER.pt(bool_dtype), + ) + return super().generate(input_name, framework, int_dtype, float_dtype) + +@register_in_tasks_manager( + "qwen3_vl_text", + *[ + "text-generation", + "text-generation-with-past", + ], + library_name="transformers", +) +class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): + MIN_TRANSFORMERS_VERSION = "4.56.0" + + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLLMInputGenerator, GemmaDummyPastKeyValuesGenerator) + DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator + NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + common_inputs = super().inputs + common_inputs["visual_pos_masks"] = {0: "batch_size", 1: "sequence_length"} + common_inputs["deepstack_visual_embeds"] = {0: "num_layers", 1: "visual_seqlen", 2: "embed_dim"} + return common_inputs + + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ) -> "ModelPatcher": + return OVDecoderModelPatcher(self, model, model_kwargs=model_kwargs) + + @register_in_tasks_manager( "qwen3_moe", @@ -3437,6 +3494,8 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return generated_input + + class DummyQwen2VLVisionEmbedInputGenerator(DummyVisionInputGenerator): SUPPORTED_INPUT_NAMES = ( "hidden_states", @@ -3503,6 +3562,75 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int return 
self.random_int_tensor([hidden_size], max_value=hidden_size) +class DummyQwen3VLVisionEmbedInputGenerator(DummyVisionInputGenerator): + SUPPORTED_INPUT_NAMES = ( + "hidden_states", + "attention_mask", + "window_attention_mask", + "window_index", + "rotary_pos_emb", + "input", + ) + + def __init__( + self, + task: str, + normalized_config: NormalizedVisionConfig, + batch_size: int = 1, + num_channels: int = DEFAULT_DUMMY_SHAPES["num_channels"], + width: int = 420, + height: int = 420, + **kwargs, + ): + self.batch_size = batch_size + self.height = height + self.width = width + self.num_channels = num_channels + self.temporal_patch_size = normalized_config.config.temporal_patch_size + self.patch_size = normalized_config.config.patch_size + if normalized_config.use_embed_dim: + self.embed_dim = ( + normalized_config.config.embed_dim + if hasattr(normalized_config.config, "embed_dim") + else normalized_config.hidden_size + ) + else: + self.embed_dim = self.num_channels * self.temporal_patch_size * self.patch_size * self.patch_size + self.num_heads = normalized_config.config.num_heads + self.spatial_merge_size = None + if hasattr(normalized_config.config, "spatial_merge_size"): + self.spatial_merge_size = normalized_config.config.spatial_merge_size + + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): + grid_h, grid_w = self.height // self.patch_size, self.width // self.patch_size + grid_t = self.batch_size + + if input_name == "hidden_states": + return self.random_float_tensor( + [grid_t * grid_h * grid_w, self.embed_dim], framework=framework, dtype=float_dtype + ) + + if input_name in ["attention_mask", "window_attention_mask"]: + return self.random_mask_tensor( + [1, grid_t * grid_h * grid_w, grid_t * grid_h * grid_w], framework=framework, dtype=float_dtype + ) + + if input_name == "rotary_pos_emb": + dim = self.embed_dim // self.num_heads // 2 + return self.random_float_tensor([grid_h * grid_t * grid_w, dim], framework=framework, dtype=float_dtype) + + if input_name == "input": + return self.constant_tensor([4, 2520], framework=framework, value=0, dtype=DTYPE_MAPPER.pt(int_dtype)) + + if input_name == "window_index": + if self.spatial_merge_size is None: + raise ValueError( + "`spatial_merge_size` parameter is not found in model config. 
Can not generate dummy input data for `window_index` input" + ) + spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size + hidden_size = (grid_t * grid_h * grid_w) // spatial_merge_unit + return self.random_int_tensor([hidden_size], max_value=hidden_size) + class Qwen2VLConfigBehavior(str, enum.Enum): LANGUAGE = "language" VISION_EMBEDDINGS = "vision_embeddings" @@ -3674,6 +3802,180 @@ def patch_model_for_export( if self._behavior == Qwen2VLConfigBehavior.VISION_EMBEDDINGS_MERGER: return Qwen2_5_VLVisionEmbMergerPatcher(self, model, model_kwargs) return super().patch_model_for_export(model, model_kwargs) + +class Qwen3VLConfigBehavior(str, enum.Enum): + LANGUAGE = "language" + VISION_EMBEDDINGS = "vision_embeddings" + VISION_EMBEDDINGS_MERGER = "vision_embeddings_merger" + TEXT_EMBEDDINGS = "text_embeddings" + VISION_EMBEDDINGS_POS = "vision_embeddings_pos" + +@register_in_tasks_manager( + "qwen3_vl", + *["image-text-to-text", "video-text-to-text"], + library_name="transformers", +) +class Qwen3_VLOpenVINOConfig(BaseVLMOpenVINOConfig): + SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen3VLConfigBehavior] + NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig + DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,) + MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") + + def __init__( + self, + config: "PretrainedConfig", + task: str = "feature-extraction", + int_dtype: str = "int64", + float_dtype: str = "fp32", + behavior: Qwen3VLConfigBehavior = Qwen3VLConfigBehavior.VISION_EMBEDDINGS, + preprocessors: Optional[List[Any]] = None, + ): + super().__init__( + config=config, + task=task, + int_dtype=int_dtype, + float_dtype=float_dtype, + preprocessors=preprocessors, + ) + self._behavior = behavior + self._orig_config = config + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = False + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS and hasattr(config, "vision_config"): + self._config = config.vision_config + self._normalized_config = self.NORMALIZED_CONFIG_CLASS(self._config) + self._normalized_config.use_embed_dim = True + + + + + @staticmethod + def get_model_for_behavior(model, behavior: Union[str, Qwen3VLConfigBehavior]): + if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior): + behavior = Qwen3VLConfigBehavior(behavior) + + if behavior == Qwen3VLConfigBehavior.LANGUAGE: + return model + + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: + vision_embeddings = model.visual.patch_embed + vision_embeddings.config = model.config.vision_config + return vision_embeddings + + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + vision_emb_merger = model.visual + vision_emb_merger.config = model.config.vision_config + return vision_emb_merger + + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + vision_emb_pos = model.visual.pos_embed + vision_emb_pos.config = model.config.vision_config + return vision_emb_pos + + if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS: + text_embedding = ( + model.model.embed_tokens if 
hasattr(model.model, "embed_tokens") else model.language_model.embed_tokens + ) + text_embedding.config = model.config + return text_embedding + + def with_behavior( + self, + behavior: Union[str, Qwen3VLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. + """ + if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior): + behavior = Qwen3VLConfigBehavior(behavior) + + if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config("qwen3_vl_text", self._orig_config.text_config, self.int_dtype, self.float_dtype) + + if behavior == Qwen3VLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_vl_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3VLLanguageModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + def patch_model_for_export( + self, model: Union["PreTrainedModel", "TFPreTrainedModel"], model_kwargs: Optional[Dict[str, Any]] = None + ): + model_kwargs = model_kwargs or {} + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return Qwen3VLVisionEmbMergerPatcher(self, model, model_kwargs) + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS or self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + return ModelPatcher(self, model, model_kwargs=model_kwargs) + return super().patch_model_for_export(model, model_kwargs) + + + @property + def inputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: + return {"hidden_states": {0: "patch_thw_grid", 1: "patch_temporal_channels"}} + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return { + "hidden_states": {0: "sequence_length"}, + "attention_mask": {1: "sequence_length", 2: "sequence_length"}, + "rotary_pos_emb": {0: "sequence_length"}, + } + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + return { + "input": {0: "sequence_length", 1: "sequence_length"}, + } + + + + @property + def outputs(self) -> Dict[str, Dict[int, str]]: + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: + return {"last_hidden_state": {0: "seq_len"}} + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return {"last_hidden_state": {0: "seq_len"}, "deepstack_feature_lists": {0: "seq_len"}} + if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + return {"last_hidden_state": {0: "seq_len", 1: "seq_len"}} + return {} @register_in_tasks_manager( diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 
851308e29e..74f1540c05 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4337,6 +4337,42 @@ def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) self._model.forward = self._model.__orig_forward + +class Qwen3VLLanguageModelPatcher(OVDecoderModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Optional[Dict[str, Any]] = None, + ): + + # Adopted from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/phi4_multimodal/modeling_phi4_multimodal.py#L2156-L2178 + # moved audio and vision features processing outside model + def lm_forward(self, attention_mask, position_ids, past_key_values, inputs_embeds, visual_pos_masks, deepstack_visual_embeds, use_cache=True): + from transformers.cache_utils import DynamicCache + + pkv = DynamicCache.from_legacy_cache(past_key_values) + outputs = self.model.language_model( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=use_cache, + past_key_values=pkv, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + ) + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + logits = self.lm_head(hidden_states) + return (logits, outputs.past_key_values.to_legacy_cache()) + + model.__orig_forward = model.forward + model.forward = types.MethodType(lm_forward, model) + super().__init__(config, model, model_kwargs) + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward def patch_qwen2vl_vision_blocks(model, force_new_behaviour=False): if not force_new_behaviour and is_transformers_version("<=", "4.48.99"): @@ -4550,6 +4586,46 @@ def __exit__(self, exc_type, exc_value, traceback): for block in self._model.blocks: block.forward = block._orig_forward block.attn.forward = block.attn._orig_forward + +class Qwen3VLVisionEmbMergerPatcher(ModelPatcher): + def __init__( + self, + config: "OnnxConfig", + model: Union["PreTrainedModel", "TFPreTrainedModel"], + model_kwargs: Dict[str, Any] = None, + ): + model.__orig_forward = model.forward + + # Modified from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1118 + # added attention_mask input instead cu_lens for its internal calculation model (unsupported by tracing due to cycle with dynamic len) + # separated patch_embed and rot_pos_emb calls for performing as part of another model + def image_embed_forward( + self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, rotary_pos_emb: torch.Tensor + ) -> torch.Tensor: + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk(hidden_states, attention_mask=attention_mask, rotary_pos_emb=rotary_pos_emb) + if layer_num in self.deepstack_visual_indexes: + deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)]( + hidden_states + ) + deepstack_feature_lists.append(deepstack_feature) + last_hidden_state = self.merger(hidden_states) + return last_hidden_state, torch.stack(deepstack_feature_lists, dim=0) + + model.forward = types.MethodType(image_embed_forward, model) + super().__init__(config, model, model_kwargs) + + def __enter__(self): + 
patch_qwen2vl_vision_blocks(self._model) + super().__enter__() + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + self._model.forward = self._model.__orig_forward + for block in self._model.blocks: + block.forward = block._orig_forward + block.attn.forward = block.attn._orig_forward # copied from https://github.com/huggingface/transformers/blob/v4.47.1/src/transformers/models/granitemoe/modeling_granitemoe.py#L321 diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index b88d7097a7..555991d974 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -144,6 +144,9 @@ def prepare_inputs( position_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, token_type_ids: Optional[torch.LongTensor] = None, + visual_pos_masks: Optional[torch.FloatTensor] = None, + deepstack_visual_embeds: Optional[torch.FloatTensor] = None, + **kwargs, ): batch_size = input_ids.shape[0] if input_ids is not None else inputs_embeds.shape[0] @@ -186,11 +189,24 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if self.config.model_type == "qwen2_vl" and position_ids.ndim != 3: + if (self.config.model_type == "qwen2_vl" or self.config.model_type == "qwen3_vl") and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids + if "visual_pos_masks" in self.input_names: + if visual_pos_masks is not None: + inputs["visual_pos_masks"] = visual_pos_masks + else: + inputs["visual_pos_masks"] = torch.ones(1, 1, dtype=torch.bool) + + if "deepstack_visual_embeds" in self.input_names: + if isinstance(deepstack_visual_embeds, list): + inputs["deepstack_visual_embeds"] = torch.Tensor(deepstack_visual_embeds) + else: + inputs["deepstack_visual_embeds"] = torch.ones((3, 1, 1), dtype=torch.float32) + print(inputs["deepstack_visual_embeds"].shape) + if "token_type_ids" in self.input_names: if token_type_ids is None: token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) @@ -200,7 +216,11 @@ def prepare_inputs( inputs["beam_idx"] = ( self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) ) - + for key, value in inputs.items(): + if hasattr(value, 'dtype'): + print(f"{key}: {value.dtype}") + else: + print(f"{key}: {type(value)}") return inputs def forward( @@ -210,6 +230,8 @@ def forward( past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, position_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.LongTensor] = None, + visual_pos_masks: Optional[torch.FloatTensor] = None, + deepstack_visual_embeds: Optional[torch.FloatTensor] = None, **kwargs, ): self.compile() @@ -220,6 +242,8 @@ def forward( past_key_values=past_key_values, position_ids=position_ids, inputs_embeds=inputs_embeds, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, **kwargs, ) # Run inference @@ -332,6 +356,7 @@ def forward(self, audio_feature, audio_mask): "vision_resampler": OVVisionResampler, "multi_modal_projector": OVMultiModalProjector, "vision_embeddings_merger": OVVisionEmbedding, + "vision_embeddings_pos": OVVisionProjection, "audio_embeddings": OVAudioEmbeddings, "audio_forward_embeddings": OVAudioEmbeddings, "audio_encoder": OVAudioEncoder, @@ -767,38 +792,75 @@ def forward( ): if pixel_values is None: 
pixel_values = images if images is not None else image_pixel_values - inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( - input_ids, - pixel_values, - image_sizes=image_sizes, - attention_mask=attention_mask, - position_ids=position_ids, - past_key_values=past_key_values, - image_bound=image_bound, - tgt_sizes=tgt_sizes, - pixel_values_videos=pixel_values_videos, - image_grid_thw=image_grid_thw, - video_grid_thw=video_grid_thw, - rope_deltas=rope_deltas, - second_per_grid_ts=second_per_grid_ts, - pixel_attention_mask=pixel_attention_mask, - input_image_embeds=input_image_embeds, - image_attention_mask=image_attention_mask, - input_audio_embeds=input_audio_embeds if input_audio_embeds is not None else audio_input_features, - audio_embed_sizes=audio_embed_sizes, - audio_attention_mask=audio_attention_mask, - input_mode=input_mode, - **kwargs, - ) - return self.language_model.forward( - input_ids=None, - inputs_embeds=inputs_embeds, - attention_mask=attention_mask, - position_ids=position_ids, - token_type_ids=token_type_ids, - past_key_values=past_key_values, - **kwargs, - ) + if self.config.model_type == "qwen3_vl": + inputs_embeds, attention_mask, position_ids, visual_pos_masks, deepstack_visual_embeds = self.get_multimodal_embeddings( + input_ids, + pixel_values, + inputs_embeds=inputs_embeds, + image_sizes=image_sizes, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + image_bound=image_bound, + tgt_sizes=tgt_sizes, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + rope_deltas=rope_deltas, + second_per_grid_ts=second_per_grid_ts, + pixel_attention_mask=pixel_attention_mask, + input_image_embeds=input_image_embeds, + image_attention_mask=image_attention_mask, + input_audio_embeds=input_audio_embeds if input_audio_embeds is not None else audio_input_features, + audio_embed_sizes=audio_embed_sizes, + audio_attention_mask=audio_attention_mask, + input_mode=input_mode, + **kwargs, + ) + return self.language_model.forward( + input_ids=None, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + past_key_values=past_key_values, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + **kwargs, + ) + else: + inputs_embeds, attention_mask, position_ids = self.get_multimodal_embeddings( + input_ids, + pixel_values, + image_sizes=image_sizes, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + image_bound=image_bound, + tgt_sizes=tgt_sizes, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + rope_deltas=rope_deltas, + second_per_grid_ts=second_per_grid_ts, + pixel_attention_mask=pixel_attention_mask, + input_image_embeds=input_image_embeds, + image_attention_mask=image_attention_mask, + input_audio_embeds=input_audio_embeds if input_audio_embeds is not None else audio_input_features, + audio_embed_sizes=audio_embed_sizes, + audio_attention_mask=audio_attention_mask, + input_mode=input_mode, + **kwargs, + ) + return self.language_model.forward( + input_ids=None, + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + token_type_ids=token_type_ids, + past_key_values=past_key_values, + **kwargs, + ) def _reorder_cache(self, past_key_values, beam_idx): return self.language_model._reorder_cache(past_key_values, beam_idx) 
@@ -2488,6 +2550,26 @@ class QWen2VLModelOutputWithPast(ModelOutput): second_per_grid_ts: Optional[torch.FloatTensor] = None +# @dataclass +# class QWen3VLModelOutputWithPast(ModelOutput): +# r""" +# past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): +# Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape +# `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + +# Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see +# `past_key_values` input) to speed up sequential decoding. +# rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): +# The rope index difference between sequence length and multimodal rope. +# """ + +# last_hidden_state: Optional[torch.FloatTensor] = None +# past_key_values: Optional[list[torch.FloatTensor]] = None +# hidden_states: Optional[tuple[torch.FloatTensor]] = None +# attentions: Optional[tuple[torch.FloatTensor]] = None +# rope_deltas: Optional[torch.LongTensor] = None + + class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM): additional_parts = ["vision_embeddings_merger"] @@ -3353,7 +3435,57 @@ def preprocess_inputs( inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt") return inputs - # Copied from https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L1602 + return model_kwargs + +class _OVQwen3VLForCausalLM(OVModelForVisualCausalLM): + additional_parts = ["vision_embeddings_merger", "vision_embeddings_pos"] + + def __init__( + self, + language_model: ov.Model, + text_embeddings: ov.Model, + vision_embeddings: ov.Model, + config: PretrainedConfig = None, + device: str = "CPU", + dynamic_shapes: bool = None, + ov_config: Optional[Dict[str, str]] = None, + model_save_dir: Optional[Union[str, Path, TemporaryDirectory]] = None, + quantization_config: Union[OVWeightQuantizationConfig, Dict] = None, + **kwargs, + ): + super().__init__( + language_model=language_model, + text_embeddings=text_embeddings, + vision_embeddings=vision_embeddings, + config=config, + device=device, + dynamic_shapes=dynamic_shapes, + ov_config=ov_config, + model_save_dir=model_save_dir, + quantization_config=quantization_config, + **kwargs, + ) + self.rope_deltas = None # cache rope_deltas here + + if is_transformers_version(">=", "4.56.0"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + Qwen3VLVisionRotaryEmbedding as VisionRotaryEmbedding, + ) + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLVisionRotaryEmbedding + + self._rotary_pos_emb = VisionRotaryEmbedding( + self.config.vision_config.hidden_size // self.config.vision_config.num_heads // 2 + ) + self.num_grid_per_side = int(config.vision_config.num_position_embeddings**0.5) + self.spatial_merge_size = config.vision_config.spatial_merge_size + head_dim = config.vision_config.hidden_size // config.vision_config.num_heads + self.rotary_pos_emb = Qwen3VLVisionRotaryEmbedding(head_dim // 2) + + else: + raise ValueError( + f"Initialization model for {self.config.model_type} required at least transformers >= 4.45" + ) + def _update_model_kwargs_for_generation( self, outputs: ModelOutput, @@ -3374,6 +3506,538 @@ def _update_model_kwargs_for_generation( return model_kwargs + def prepare_inputs_for_generation( + self, + input_ids, + past_key_values=None, + attention_mask=None, + inputs_embeds=None, + cache_position=None, + 
position_ids=None, + use_cache=True, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + **kwargs, + ): + # Overwritten -- in specific circumstances we don't want to forward image inputs to the model + if past_key_values is not None: + if inputs_embeds is not None and input_ids.shape[1] == 0: # Exception 4 + inputs_embeds = inputs_embeds[:, -cache_position.shape[0] :] + elif inputs_embeds is not None: + input_ids = input_ids[:, -cache_position.shape[0] :] + elif input_ids.shape[1] != cache_position.shape[0]: # Default case (the "else", a no op, is Exception 2) + input_ids = input_ids[:, cache_position] + + if cache_position[0] != 0: + pixel_values = None + pixel_values_videos = None + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]: + model_inputs = {"inputs_embeds": inputs_embeds, "input_ids": None} + else: + model_inputs = {"input_ids": input_ids, "inputs_embeds": None} + + model_inputs.update( + { + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": use_cache, + "attention_mask": attention_mask, + "pixel_values": pixel_values, + "pixel_values_videos": pixel_values_videos, + "image_grid_thw": image_grid_thw, + "video_grid_thw": video_grid_thw, + "cache_position": cache_position, + } + ) + return model_inputs + + + def get_rope_index( + self, + input_ids: Optional[torch.LongTensor] = None, + image_grid_thw: Optional[torch.LongTensor] = None, + video_grid_thw: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + ) -> tuple[torch.Tensor, torch.Tensor]: + """Different from the original implementation, Qwen3VL use timestamps rather than absolute time position ids.""" + + # Since we use timestamps to seperate videos, like , the video_grid_thw should also be split + if video_grid_thw is not None: + video_grid_thw = torch.repeat_interleave(video_grid_thw, video_grid_thw[:, 0], dim=0) + video_grid_thw[:, 0] = 1 + + spatial_merge_size = self.config.vision_config.spatial_merge_size + image_token_id = self.config.image_token_id + video_token_id = self.config.video_token_id + vision_start_token_id = self.config.vision_start_token_id + mrope_position_deltas = [] + if input_ids is not None and (image_grid_thw is not None or video_grid_thw is not None): + total_input_ids = input_ids + if attention_mask is None: + attention_mask = torch.ones_like(total_input_ids) + position_ids = torch.ones( + 3, + input_ids.shape[0], + input_ids.shape[1], + dtype=input_ids.dtype, + device=input_ids.device, + ) + image_index, video_index = 0, 0 + attention_mask = attention_mask.to(total_input_ids.device) + for i, input_ids in enumerate(total_input_ids): + input_ids = input_ids[attention_mask[i] == 1] + image_nums, video_nums = 0, 0 + vision_start_indices = torch.argwhere(input_ids == vision_start_token_id).squeeze(1) + vision_tokens = input_ids[vision_start_indices + 1] + image_nums = (vision_tokens == image_token_id).sum() + video_nums = (vision_tokens == video_token_id).sum() + input_tokens = input_ids.tolist() + llm_pos_ids_list: list = [] + st = 0 + remain_images, remain_videos = image_nums, video_nums + for _ in range(image_nums + video_nums): + if image_token_id in input_tokens and remain_images > 0: + ed_image = input_tokens.index(image_token_id, st) + else: + ed_image = len(input_tokens) + 1 + if video_token_id in input_tokens and remain_videos > 0: + ed_video = 
input_tokens.index(video_token_id, st) + else: + ed_video = len(input_tokens) + 1 + if ed_image < ed_video: + t, h, w = ( + image_grid_thw[image_index][0], + image_grid_thw[image_index][1], + image_grid_thw[image_index][2], + ) + image_index += 1 + remain_images -= 1 + ed = ed_image + + else: + t, h, w = ( + video_grid_thw[video_index][0], + video_grid_thw[video_index][1], + video_grid_thw[video_index][2], + ) + video_index += 1 + remain_videos -= 1 + ed = ed_video + llm_grid_t, llm_grid_h, llm_grid_w = ( + t.item(), + h.item() // spatial_merge_size, + w.item() // spatial_merge_size, + ) + text_len = ed - st + + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + # t_index is always 0 because llm_grid_t is always 1 (we use timestamps to encode the temporal information for videos) + t_index = torch.arange(llm_grid_t).view(-1, 1).expand(-1, llm_grid_h * llm_grid_w).flatten() + h_index = torch.arange(llm_grid_h).view(1, -1, 1).expand(llm_grid_t, -1, llm_grid_w).flatten() + w_index = torch.arange(llm_grid_w).view(1, 1, -1).expand(llm_grid_t, llm_grid_h, -1).flatten() + llm_pos_ids_list.append(torch.stack([t_index, h_index, w_index]) + text_len + st_idx) + st = ed + llm_grid_t * llm_grid_h * llm_grid_w + + if st < len(input_tokens): + st_idx = llm_pos_ids_list[-1].max() + 1 if len(llm_pos_ids_list) > 0 else 0 + text_len = len(input_tokens) - st + llm_pos_ids_list.append(torch.arange(text_len).view(1, -1).expand(3, -1) + st_idx) + + llm_positions = torch.cat(llm_pos_ids_list, dim=1).reshape(3, -1) + position_ids[..., i, attention_mask[i] == 1] = llm_positions.to(position_ids.device) + mrope_position_deltas.append(llm_positions.max() + 1 - len(total_input_ids[i])) + mrope_position_deltas = torch.tensor(mrope_position_deltas, device=input_ids.device).unsqueeze(1) + return position_ids, mrope_position_deltas + else: + if attention_mask is not None: + position_ids = attention_mask.long().cumsum(-1) - 1 + position_ids.masked_fill_(attention_mask == 0, 1) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1).to(attention_mask.device) + max_position_ids = position_ids.max(0, keepdim=False)[0].max(-1, keepdim=True)[0] + mrope_position_deltas = max_position_ids + 1 - attention_mask.shape[-1] + else: + position_ids = ( + torch.arange(input_ids.shape[1], device=input_ids.device) + .view(1, 1, -1) + .expand(3, input_ids.shape[0], -1) + ) + mrope_position_deltas = torch.zeros( + [input_ids.shape[0], 1], + device=input_ids.device, + dtype=input_ids.dtype, + ) + + return position_ids, mrope_position_deltas + + def rot_pos_emb(self, grid_thw: torch.Tensor) -> torch.Tensor: + merge_size = self.spatial_merge_size + + max_hw = int(grid_thw[:, 1:].max().item()) + freq_table = self.rotary_pos_emb(max_hw) # (max_hw, dim // 2) + device = freq_table.device + + total_tokens = int(torch.prod(grid_thw, dim=1).sum().item()) + pos_ids = torch.empty((total_tokens, 2), dtype=torch.long, device=device) + + offset = 0 + for num_frames, height, width in grid_thw: + merged_h, merged_w = height // merge_size, width // merge_size + + block_rows = torch.arange(merged_h, device=device) # block row indices + block_cols = torch.arange(merged_w, device=device) # block col indices + intra_row = torch.arange(merge_size, device=device) # intra-block row offsets + intra_col = torch.arange(merge_size, device=device) # intra-block col offsets + + # Compute full-resolution positions + row_idx = block_rows[:, None, None, None] 
* merge_size + intra_row[None, None, :, None] + col_idx = block_cols[None, :, None, None] * merge_size + intra_col[None, None, None, :] + + row_idx = row_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + col_idx = col_idx.expand(merged_h, merged_w, merge_size, merge_size).reshape(-1) + + coords = torch.stack((row_idx, col_idx), dim=-1) + + if num_frames > 1: + coords = coords.repeat(num_frames, 1) + + num_tokens = coords.shape[0] + pos_ids[offset : offset + num_tokens] = coords + offset += num_tokens + + embeddings = freq_table[pos_ids] # lookup rotary embeddings + embeddings = embeddings.flatten(1) + return embeddings + + def fast_pos_embed_interpolate(self, grid_thw): + grid_ts, grid_hs, grid_ws = grid_thw[:, 0], grid_thw[:, 1], grid_thw[:, 2] + + idx_list = [[] for _ in range(4)] + weight_list = [[] for _ in range(4)] + + for t, h, w in zip(grid_ts, grid_hs, grid_ws): + h_idxs = torch.linspace(0, self.num_grid_per_side - 1, h) + w_idxs = torch.linspace(0, self.num_grid_per_side - 1, w) + + h_idxs_floor = h_idxs.int() + w_idxs_floor = w_idxs.int() + h_idxs_ceil = (h_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + w_idxs_ceil = (w_idxs.int() + 1).clip(max=self.num_grid_per_side - 1) + + dh = h_idxs - h_idxs_floor + dw = w_idxs - w_idxs_floor + + base_h = h_idxs_floor * self.num_grid_per_side + base_h_ceil = h_idxs_ceil * self.num_grid_per_side + + indices = [ + (base_h[None].T + w_idxs_floor[None]).flatten(), + (base_h[None].T + w_idxs_ceil[None]).flatten(), + (base_h_ceil[None].T + w_idxs_floor[None]).flatten(), + (base_h_ceil[None].T + w_idxs_ceil[None]).flatten(), + ] + + weights = [ + ((1 - dh)[None].T * (1 - dw)[None]).flatten(), + ((1 - dh)[None].T * dw[None]).flatten(), + (dh[None].T * (1 - dw)[None]).flatten(), + (dh[None].T * dw[None]).flatten(), + ] + + for i in range(4): + idx_list[i].extend(indices[i].tolist()) + weight_list[i].extend(weights[i].tolist()) + + idx_tensor = torch.tensor(idx_list) + weight_tensor = torch.tensor(weight_list) + pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)[0]) * weight_tensor[:, :, None] + patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] + + patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) + + patch_pos_embeds_permute = [] + merge_size = self.config.vision_config.spatial_merge_size + for pos_embed, t, h, w in zip(patch_pos_embeds, grid_ts, grid_hs, grid_ws): + pos_embed = pos_embed.repeat(t, 1) + pos_embed = ( + pos_embed.view(t, h // merge_size, merge_size, w // merge_size, merge_size, -1) + .permute(0, 1, 3, 2, 4, 5) + .flatten(0, 4) + ) + patch_pos_embeds_permute.append(pos_embed) + patch_pos_embeds = torch.cat(patch_pos_embeds_permute) + return patch_pos_embeds + + + def get_placeholder_mask( + self, + input_ids: torch.LongTensor, + inputs_embeds: torch.FloatTensor, + image_features: Optional[torch.FloatTensor] = None, + video_features: Optional[torch.FloatTensor] = None, + ): + """ + Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is + equal to the length of multimodal features. If the lengths are different, an error is raised. 
+ """ + if input_ids is None: + special_image_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_image_mask = special_image_mask.all(-1) + special_video_mask = inputs_embeds == self.get_input_embeddings()( + torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device) + ) + special_video_mask = special_video_mask.all(-1) + else: + special_image_mask = input_ids == self.config.image_token_id + special_video_mask = input_ids == self.config.video_token_id + + n_image_tokens = special_image_mask.sum() + special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel(): + raise ValueError( + f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.shape[0]}" + ) + + n_video_tokens = special_video_mask.sum() + special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device) + if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel(): + raise ValueError( + f"Videos features and video tokens do not match: tokens: {n_video_tokens}, features {video_features.shape[0]}" + ) + + return special_image_mask, special_video_mask + + + def get_vision_embeddings(self, pixel_values, grid_thw, **kwargs): + hidden_states = torch.from_numpy(self.vision_embeddings(pixel_values)[0]) + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, dtype=torch.int32 + ) + cu_seqlens = torch.nn.functional.pad(cu_seqlens, (1, 0), value=0) + attention_mask = torch.zeros((1, hidden_states.shape[0], hidden_states.shape[0]), dtype=torch.bool) + causal_mask = torch.zeros_like(attention_mask, dtype=torch.float32) + for i in range(1, len(cu_seqlens)): + attention_mask[..., cu_seqlens[i - 1] : cu_seqlens[i], cu_seqlens[i - 1] : cu_seqlens[i]] = True + + causal_mask.masked_fill_(torch.logical_not(attention_mask), float("-inf")) + + res = self.vision_embeddings_merger( + pixel_values=hidden_states, attention_mask=causal_mask, rotary_pos_emb=rotary_pos_emb + ) + return res[0], res[1] + + + def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Optional[torch.LongTensor] = None): + """ + Encodes images into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned. + + Args: + pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input images. + image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*): + The temporal, height and width of feature shape of each image in LLM. 
+ """ + # pixel_values = pixel_values.type(self.visual.dtype) + image_embeds, deepstack_image_embeds = self.get_vision_embeddings(pixel_values, image_grid_thw) + image_embeds, deepstack_image_embeds = torch.from_numpy(image_embeds), torch.from_numpy(deepstack_image_embeds) + deepstack_image_embeds = deepstack_image_embeds.tolist() + print(image_grid_thw.prod(-1)) + split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() + print(image_embeds.shape) + image_embeds = torch.split(image_embeds, split_sizes) + return image_embeds, deepstack_image_embeds + + + def get_video_features( + self, pixel_values_videos: torch.FloatTensor, video_grid_thw: Optional[torch.LongTensor] = None + ): + """ + Encodes videos into continuous embeddings that can be forwarded to the language model. The deepstack visual features are also returned. + + Args: + pixel_values_videos (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`): + The tensors corresponding to the input videos. + video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): + The temporal, height and width of feature shape of each video in LLM. + """ + pixel_values_videos = pixel_values_videos.type(self.visual.dtype) + video_embeds = self.get_vision_embeddings(pixel_values_videos, video_grid_thw) + video_embeds, deepstack_video_embeds = torch.from_numpy(video_embeds[0]), torch.from_numpy(video_embeds[1]) + split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + video_embeds = torch.split(video_embeds, split_sizes) + return video_embeds, deepstack_video_embeds + + def get_multimodal_embeddings( + self, + input_ids, + pixel_values=None, + attention_mask=None, + position_ids=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + cache_position=None, + **kwargs, + ): + image_mask = None + video_mask = None + inputs_embeds = torch.from_numpy(self.get_text_embeddings(input_ids)) + + if pixel_values is not None: + image_embeds, deepstack_image_embeds = self.get_image_features(pixel_values, image_grid_thw) + image_embeds = torch.cat(image_embeds, dim=0) + image_mask, _ = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if pixel_values_videos is not None: + video_embeds, deepstack_video_embeds = self.get_video_features(pixel_values_videos, video_grid_thw) + video_embeds = torch.cat(video_embeds, dim=0) + _, video_mask = self.get_placeholder_mask( + input_ids, inputs_embeds=inputs_embeds, video_features=video_embeds + ) + inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds) + + visual_pos_masks = None + deepstack_visual_embeds = None + if image_mask is not None and video_mask is not None: + # aggregate visual_pos_masks and deepstack_visual_embeds + image_mask = image_mask[..., 0] + video_mask = video_mask[..., 0] + visual_pos_masks = image_mask | video_mask + deepstack_visual_embeds = [] + image_mask_joint = image_mask[visual_pos_masks] + video_mask_joint = video_mask[visual_pos_masks] + for img_embed, vid_embed in zip(deepstack_image_embeds, deepstack_video_embeds): + embed_joint = img_embed.new_zeros(visual_pos_masks.sum(), img_embed.shape[-1]).to(img_embed.device) + embed_joint[image_mask_joint, :] = img_embed + embed_joint[video_mask_joint, :] = vid_embed + deepstack_visual_embeds.append(embed_joint) + elif image_mask is not None: + image_mask = image_mask[..., 0] + visual_pos_masks = 
image_mask + deepstack_visual_embeds = deepstack_image_embeds + elif video_mask is not None: + video_mask = video_mask[..., 0] + visual_pos_masks = video_mask + deepstack_visual_embeds = deepstack_video_embeds + + if position_ids is None: + + # Calculate RoPE index once per generation in the pre-fill stage only. + # When compiling, we can't check tensor values thus we check only input length + # It is safe to assume that `length!=1` means we're in pre-fill because compiled + # models currently cannot do asssisted decoding + if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): + # calculate RoPE index once per generation in the pre-fill stage only + if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, image_grid_thw, video_grid_thw, attention_mask + ) + self.rope_deltas = rope_deltas + # then use the prev pre-calculated rope-deltas to get the correct position ids + else: + batch_size, seq_length, _ = inputs_embeds.shape + delta = ( + (cache_position[0] + self.rope_deltas).to(inputs_embeds.device) + if cache_position is not None + else 0 + ) + position_ids = torch.arange(seq_length, device=inputs_embeds.device) + position_ids = position_ids.view(1, -1).expand(batch_size, -1) + if cache_position is not None: # otherwise `deltas` is an int `0` + delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0) + position_ids = position_ids.add(delta) + position_ids = position_ids.unsqueeze(0).expand(3, -1, -1) + return inputs_embeds, attention_mask, position_ids, visual_pos_masks, deepstack_visual_embeds + + @staticmethod + def preprocess_inputs( + text: str, + image: Optional["Image"] = None, + processor: Optional[AutoImageProcessor] = None, + tokenizer: Optional[PreTrainedTokenizer] = None, + config: Optional[PretrainedConfig] = None, + video: Optional["VideoInput"] = None, + audio: Optional[np.ndarray] = None, + ): + if processor is None: + raise ValueError("Processor is required.") + if audio is not None: + raise ValueError("Audio input is not supported") + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": text}, + ], + } + ] + if image is not None: + conversation[0]["content"].insert(0, {"type": "image"}) + if video is not None: + conversation[0]["content"].insert(0, {"type": "video"}) + + text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + + inputs = processor(images=image, text=text_prompt, videos=video, return_tensors="pt") + return inputs + + + def forward( + self, + input_ids, + pixel_values=None, + past_key_values=None, + inputs_embeds=None, + image_sizes=None, + attention_mask=None, + position_ids=None, + image_bound=None, + tgt_sizes=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + rope_deltas=None, + **kwargs, + ): + result = super().forward( + input_ids, + pixel_values, + past_key_values, + inputs_embeds, + image_sizes, + attention_mask, + position_ids, + image_bound, + tgt_sizes, + pixel_values_videos, + image_grid_thw, + video_grid_thw, + rope_deltas, + **kwargs, + ) + final_result = QWen2VLModelOutputWithPast( + logits=result.logits, past_key_values=result.past_key_values, rope_deltas=rope_deltas + ) + return final_result + + class _OVMaira2ForCausalLM(_OVLlavaForCausalLM): @staticmethod def preprocess_inputs( @@ -4349,6 +5013,7 @@ def preprocess_inputs( "internvl_chat": _OVInternVLForCausalLM, "qwen2_vl": 
_OVQwen2VLForCausalLM, "qwen2_5_vl": _OVQwen2_5_VLForCausalLM, + "qwen3_vl": _OVQwen3VLForCausalLM, "got_ocr2": _OVGotOCR2ForCausalLM, "gemma3": _OVGemma3ForCausalLM, "idefics3": _OVIdefics3ForCausalLM, From b47cc60330f55e5e1d0290706048950ef9412f45 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Sat, 13 Sep 2025 17:05:39 +0800 Subject: [PATCH 02/44] Update setup.py --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 4af8f58123..1fc9c262af 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "optimum==1.27.*", - "transformers>=4.36,<4.54", + "optimum==", + "transformers>=4.36", "datasets>=1.4.0", "setuptools", "scipy", From 8654a5367039883d3ac968f1e1367a28ce4e9af3 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Sat, 13 Sep 2025 17:10:48 +0800 Subject: [PATCH 03/44] Update modeling_visual_language.py --- optimum/intel/openvino/modeling_visual_language.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 555991d974..afbeac0fd7 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3755,7 +3755,7 @@ def fast_pos_embed_interpolate(self, grid_thw): idx_tensor = torch.tensor(idx_list) weight_tensor = torch.tensor(weight_list) - pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)[0]) * weight_tensor[:, :, None] + pos_embeds = torch.from_numpy(self.vision_embeddings_pos(idx_tensor)) * weight_tensor[:, :, None] patch_pos_embeds = pos_embeds[0] + pos_embeds[1] + pos_embeds[2] + pos_embeds[3] patch_pos_embeds = patch_pos_embeds.split([h * w for h, w in zip(grid_hs, grid_ws)]) From e1f75c327fe1113fae2a2c35313a7731a5148806 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Sat, 13 Sep 2025 17:11:48 +0800 Subject: [PATCH 04/44] Update model_patcher.py --- optimum/exporters/openvino/model_patcher.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 74f1540c05..e1c2f596cc 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4362,8 +4362,10 @@ def lm_forward(self, attention_mask, position_ids, past_key_values, inputs_embed deepstack_visual_embeds=deepstack_visual_embeds, ) hidden_states = outputs[0] + logits_to_keep = 1 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - logits = self.lm_head(hidden_states) + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + logits = self.lm_head(hidden_states[:, slice_indices, :]) return (logits, outputs.past_key_values.to_legacy_cache()) model.__orig_forward = model.forward From d26021655adeffd242f641ec53a2e5873dfde585 Mon Sep 17 00:00:00 2001 From: ethan Date: Sat, 13 Sep 2025 19:57:05 -0700 Subject: [PATCH 05/44] update --- optimum/exporters/openvino/model_configs.py | 3 +- optimum/exporters/openvino/model_patcher.py | 3 +- .../openvino/modeling_visual_language.py | 34 ++----------------- 3 files changed, 4 insertions(+), 36 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ed63baca1c..698bc3a37a 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -3819,7 +3819,7 @@ class 
Qwen3_VLOpenVINOConfig(BaseVLMOpenVINOConfig): SUPPORTED_BEHAVIORS = [model_type.value for model_type in Qwen3VLConfigBehavior] NORMALIZED_CONFIG_CLASS = NormalizedVisionConfig DUMMY_INPUT_GENERATOR_CLASSES = (DummyQwen3VLVisionEmbedInputGenerator,) - MIN_TRANSFORMERS_VERSION = version.parse("4.45.0") + MIN_TRANSFORMERS_VERSION = version.parse("4.56.0") def __init__( self, @@ -3966,7 +3966,6 @@ def inputs(self) -> Dict[str, Dict[int, str]]: } - @property def outputs(self) -> Dict[str, Dict[int, str]]: if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index e1c2f596cc..1aee58b255 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4362,9 +4362,8 @@ def lm_forward(self, attention_mask, position_ids, past_key_values, inputs_embed deepstack_visual_embeds=deepstack_visual_embeds, ) hidden_states = outputs[0] - logits_to_keep = 1 # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + slice_indices = slice(-1, None) logits = self.lm_head(hidden_states[:, slice_indices, :]) return (logits, outputs.past_key_values.to_legacy_cache()) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index afbeac0fd7..a22286180e 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -198,15 +198,13 @@ def prepare_inputs( if visual_pos_masks is not None: inputs["visual_pos_masks"] = visual_pos_masks else: - inputs["visual_pos_masks"] = torch.ones(1, 1, dtype=torch.bool) + inputs["visual_pos_masks"] = torch.zeros(1, 1, dtype=torch.bool) if "deepstack_visual_embeds" in self.input_names: if isinstance(deepstack_visual_embeds, list): inputs["deepstack_visual_embeds"] = torch.Tensor(deepstack_visual_embeds) else: - inputs["deepstack_visual_embeds"] = torch.ones((3, 1, 1), dtype=torch.float32) - print(inputs["deepstack_visual_embeds"].shape) - + inputs["deepstack_visual_embeds"] = torch.zeros((3, 1, 1), dtype=torch.float32) if "token_type_ids" in self.input_names: if token_type_ids is None: token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) @@ -216,11 +214,6 @@ def prepare_inputs( inputs["beam_idx"] = ( self.next_beam_idx if self.next_beam_idx is not None else np.arange(batch_size, dtype=int) ) - for key, value in inputs.items(): - if hasattr(value, 'dtype'): - print(f"{key}: {value.dtype}") - else: - print(f"{key}: {type(value)}") return inputs def forward( @@ -2549,27 +2542,6 @@ class QWen2VLModelOutputWithPast(ModelOutput): rope_deltas: Optional[torch.FloatTensor] = None second_per_grid_ts: Optional[torch.FloatTensor] = None - -# @dataclass -# class QWen3VLModelOutputWithPast(ModelOutput): -# r""" -# past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): -# Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape -# `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - -# Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see -# `past_key_values` input) to speed up sequential decoding. 
-# rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*): -# The rope index difference between sequence length and multimodal rope. -# """ - -# last_hidden_state: Optional[torch.FloatTensor] = None -# past_key_values: Optional[list[torch.FloatTensor]] = None -# hidden_states: Optional[tuple[torch.FloatTensor]] = None -# attentions: Optional[tuple[torch.FloatTensor]] = None -# rope_deltas: Optional[torch.LongTensor] = None - - class _OVQwen2VLForCausalLM(OVModelForVisualCausalLM): additional_parts = ["vision_embeddings_merger"] @@ -3855,9 +3827,7 @@ def get_image_features(self, pixel_values: torch.FloatTensor, image_grid_thw: Op image_embeds, deepstack_image_embeds = self.get_vision_embeddings(pixel_values, image_grid_thw) image_embeds, deepstack_image_embeds = torch.from_numpy(image_embeds), torch.from_numpy(deepstack_image_embeds) deepstack_image_embeds = deepstack_image_embeds.tolist() - print(image_grid_thw.prod(-1)) split_sizes = (image_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() - print(image_embeds.shape) image_embeds = torch.split(image_embeds, split_sizes) return image_embeds, deepstack_image_embeds From 047e30b4d80e124cb1316e7a339338acc4d50fc4 Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 14 Sep 2025 07:07:07 -0700 Subject: [PATCH 06/44] set to static shape --- optimum/exporters/openvino/model_configs.py | 37 ++++++++++++++++--- optimum/exporters/openvino/utils.py | 1 + .../openvino/modeling_visual_language.py | 4 +- setup.py | 2 +- 4 files changed, 37 insertions(+), 7 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 698bc3a37a..ef34d38a3b 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -349,13 +349,40 @@ class DummyQwen3VLLMInputGenerator(DummyTextInputGenerator): "visual_pos_masks", "deepstack_visual_embeds", ) - + + def __init__( + self, + task: str, + normalized_config: NormalizedTextConfig, + batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], + sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], + num_choices: int = DEFAULT_DUMMY_SHAPES["num_choices"], + random_batch_size_range: Optional[Tuple[int, int]] = None, + random_sequence_length_range: Optional[Tuple[int, int]] = None, + random_num_choices_range: Optional[Tuple[int, int]] = None, + padding_side: str = "right", + **kwargs, + ): + super().__init__( + task=task, + normalized_config=normalized_config, + batch_size=batch_size, + sequence_length=sequence_length, + num_choices=num_choices, + random_batch_size_range=random_batch_size_range, + random_sequence_length_range=random_sequence_length_range, + random_num_choices_range=random_num_choices_range, + padding_side=padding_side, + **kwargs, + ) + self.embed_dim = normalized_config.hidden_size + def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32", bool_dtype: str = "bool"): if input_name == "deepstack_visual_embeds": - return self.random_float_tensor([3, 32, 2560], framework=framework, dtype=float_dtype) + return self.random_float_tensor([3, 2*self.sequence_length, self.embed_dim], framework=framework, dtype=float_dtype) if input_name == "visual_pos_masks": return self.constant_tensor( - shape=[self.batch_size, 16], + shape=[self.batch_size, self.sequence_length], framework=framework, value=1, dtype=DTYPE_MAPPER.pt(bool_dtype), @@ -381,7 +408,7 @@ class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): def inputs(self) -> 
Dict[str, Dict[int, str]]: common_inputs = super().inputs common_inputs["visual_pos_masks"] = {0: "batch_size", 1: "sequence_length"} - common_inputs["deepstack_visual_embeds"] = {0: "num_layers", 1: "visual_seqlen", 2: "embed_dim"} + common_inputs["deepstack_visual_embeds"] = {0: "num_layers", 1: "visual_seqlen"} return common_inputs def patch_model_for_export( @@ -3962,7 +3989,7 @@ def inputs(self) -> Dict[str, Dict[int, str]]: } if self._behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: return { - "input": {0: "sequence_length", 1: "sequence_length"}, + "input": {1: "sequence_length"}, } diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index d1318fc109..0ce8c2fd3e 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -228,6 +228,7 @@ def get_submodels(model): "phi3_v", "qwen2_vl", "qwen2_5_vl", + "qwen3_vl", "got_ocr2", "gemma3", "idefics3", diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index a22286180e..bac9e15857 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -201,10 +201,12 @@ def prepare_inputs( inputs["visual_pos_masks"] = torch.zeros(1, 1, dtype=torch.bool) if "deepstack_visual_embeds" in self.input_names: + num_layers = len(self.config.vision_config.deepstack_visual_indexes) + emd_dim = self.config.text_config.hidden_size if isinstance(deepstack_visual_embeds, list): inputs["deepstack_visual_embeds"] = torch.Tensor(deepstack_visual_embeds) else: - inputs["deepstack_visual_embeds"] = torch.zeros((3, 1, 1), dtype=torch.float32) + inputs["deepstack_visual_embeds"] = torch.zeros((num_layers, 1, emd_dim), dtype=torch.float32) if "token_type_ids" in self.input_names: if token_type_ids is None: token_type_ids = np.zeros(inputs_embeds.shape[:2], dtype=int) diff --git a/setup.py b/setup.py index 1fc9c262af..622d276b71 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=1.11", - "optimum==", + "optimum", "transformers>=4.36", "datasets>=1.4.0", "setuptools", From 6c88fbf9fd5b26c54fff89ed2ef2d65dcb0d97e9 Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 14 Sep 2025 21:19:40 -0700 Subject: [PATCH 07/44] add qwen3vl_moe support --- optimum/exporters/openvino/model_configs.py | 75 ++++++++++++++++++- optimum/exporters/openvino/utils.py | 1 + .../openvino/modeling_visual_language.py | 6 +- 3 files changed, 78 insertions(+), 4 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index ef34d38a3b..628cac0f7e 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -146,7 +146,6 @@ XverseModelPatcher, ) - def init_model_configs(): if "open_clip" not in TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES: TasksManager._LIBRARY_TO_SUPPORTED_MODEL_TYPES["open_clip"] = {} @@ -170,6 +169,10 @@ def init_model_configs(): "transformers", "AutoModelForImageTextToText", ) + TasksManager._CUSTOM_CLASSES[("pt", "qwen3_vl_moe", "image-text-to-text")] = ( + "transformers", + "AutoModelForImageTextToText", + ) TasksManager._CUSTOM_CLASSES[("pt", "llava_next_video", "image-text-to-text")] = ( "transformers", "AutoModelForVision2Seq", @@ -397,6 +400,14 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int ], library_name="transformers", ) +@register_in_tasks_manager( + "qwen3_vl_moe_text", + *[ + "text-generation", + 
"text-generation-with-past", + ], + library_name="transformers", +) class Qwen3VLTextOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): MIN_TRANSFORMERS_VERSION = "4.56.0" @@ -4004,6 +4015,68 @@ def outputs(self) -> Dict[str, Dict[int, str]]: return {} +@register_in_tasks_manager( + "qwen3_vl_moe", + *["image-text-to-text", "video-text-to-text"], + library_name="transformers", +) +class Qwen3_VL_MOEOpenVINOConfig(Qwen3_VLOpenVINOConfig): + def with_behavior( + self, + behavior: Union[str, Qwen3VLConfigBehavior], + ): + """ + Creates a config for different behaviour. + Args: + behavior ([`ConfigBehavior`]): + The behavior to use for the new instance. + """ + if isinstance(behavior, str) and not isinstance(behavior, Qwen3VLConfigBehavior): + behavior = Qwen3VLConfigBehavior(behavior) + + if behavior == Qwen3VLConfigBehavior.TEXT_EMBEDDINGS: + return get_vlm_text_embeddings_config("qwen3_vl_moe_text", self._orig_config.text_config, self.int_dtype, self.float_dtype) + + if behavior == Qwen3VLConfigBehavior.LANGUAGE: + return get_vlm_text_generation_config( + "qwen3_vl_moe_text", + self._orig_config.text_config, + self.int_dtype, + self.float_dtype, + model_patcher=Qwen3VLLanguageModelPatcher, + dummy_input_generator=DummyQwen2VLLMInputGenerator, + inputs_update={"position_ids": {1: "batch_size", 2: "sequence_length"}}, + ) + + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_MERGER: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + if behavior == Qwen3VLConfigBehavior.VISION_EMBEDDINGS_POS: + return self.__class__( + self._orig_config, + task=self.task, + int_dtype=self.int_dtype, + float_dtype=self.float_dtype, + behavior=behavior, + preprocessors=self._preprocessors, + ) + + @register_in_tasks_manager( "glm", *[ diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 0ce8c2fd3e..5d04ad3585 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -229,6 +229,7 @@ def get_submodels(model): "qwen2_vl", "qwen2_5_vl", "qwen3_vl", + "qwen3_vl_moe", "got_ocr2", "gemma3", "idefics3", diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index bac9e15857..a57c2d34ae 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -189,7 +189,7 @@ def prepare_inputs( if past_len: position_ids = position_ids[:, -inputs_embeds.shape[1] :] - if (self.config.model_type == "qwen2_vl" or self.config.model_type == "qwen3_vl") and position_ids.ndim != 3: + if (self.config.model_type == "qwen2_vl" or self.config.model_type == "qwen3_vl" or self.config.model_type == "qwen3_vl_moe") and position_ids.ndim != 3: position_ids = np.repeat(np.expand_dims(position_ids, 0), 3, axis=0) inputs["position_ids"] = position_ids @@ -230,7 +230,6 @@ def forward( **kwargs, ): self.compile() - inputs = self.prepare_inputs( input_ids=input_ids, attention_mask=attention_mask, @@ -787,7 +786,7 @@ def forward( ): if pixel_values is None: pixel_values = images if images is not None else image_pixel_values - if self.config.model_type == "qwen3_vl": + if 
self.config.model_type == "qwen3_vl" or self.config.model_type == "qwen3_vl_moe": inputs_embeds, attention_mask, position_ids, visual_pos_masks, deepstack_visual_embeds = self.get_multimodal_embeddings( input_ids, pixel_values, @@ -4986,6 +4985,7 @@ def preprocess_inputs( "qwen2_vl": _OVQwen2VLForCausalLM, "qwen2_5_vl": _OVQwen2_5_VLForCausalLM, "qwen3_vl": _OVQwen3VLForCausalLM, + "qwen3_vl_moe": _OVQwen3VLForCausalLM, "got_ocr2": _OVGotOCR2ForCausalLM, "gemma3": _OVGemma3ForCausalLM, "idefics3": _OVIdefics3ForCausalLM, From a2c7350492553cbecbe33a2c2079a8844306cf27 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Fri, 19 Sep 2025 09:25:14 +0800 Subject: [PATCH 08/44] Update modeling_visual_language.py --- optimum/intel/openvino/modeling_visual_language.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index a57c2d34ae..236148a85f 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3845,10 +3845,9 @@ def get_video_features( video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*): The temporal, height and width of feature shape of each video in LLM. """ - pixel_values_videos = pixel_values_videos.type(self.visual.dtype) video_embeds = self.get_vision_embeddings(pixel_values_videos, video_grid_thw) video_embeds, deepstack_video_embeds = torch.from_numpy(video_embeds[0]), torch.from_numpy(video_embeds[1]) - split_sizes = (video_grid_thw.prod(-1) // self.visual.spatial_merge_size**2).tolist() + split_sizes = (video_grid_thw.prod(-1) // self.spatial_merge_size**2).tolist() video_embeds = torch.split(video_embeds, split_sizes) return video_embeds, deepstack_video_embeds From c7b2d280ff8c86549335ea84ab4cf12c7913f068 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Mon, 22 Sep 2025 18:29:47 +0800 Subject: [PATCH 09/44] Update modeling_visual_language.py --- .../openvino/modeling_visual_language.py | 24 +++++++++++++------ 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index 236148a85f..fdc435fd66 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -3908,18 +3908,28 @@ def get_multimodal_embeddings( deepstack_visual_embeds = deepstack_video_embeds if position_ids is None: + attention_mask_tensor = ( + attention_mask if not isinstance(attention_mask, dict) else attention_mask["full_attention"] + ) + if attention_mask_tensor is not None and attention_mask_tensor.ndim == 4: + attention_mask_tensor = torch.diagonal(attention_mask_tensor[:, 0], dim1=1, dim2=2) + # Only apply conversion for floating point tensors (inverted masks) + if attention_mask_tensor.dtype.is_floating_point: + attention_mask_tensor = attention_mask_tensor / torch.finfo(attention_mask_tensor.dtype).min + attention_mask_tensor = (1.0 - attention_mask_tensor).int() # Calculate RoPE index once per generation in the pre-fill stage only. 
# When compiling, we can't check tensor values thus we check only input length # It is safe to assume that `length!=1` means we're in pre-fill because compiled # models currently cannot do asssisted decoding - if position_ids is None and input_ids is not None and (attention_mask is None or attention_mask.ndim == 2): - # calculate RoPE index once per generation in the pre-fill stage only - if (cache_position is not None and cache_position[0] == 0) or self.rope_deltas is None: - position_ids, rope_deltas = self.get_rope_index( - input_ids, image_grid_thw, video_grid_thw, attention_mask - ) - self.rope_deltas = rope_deltas + if self.rope_deltas is None: + position_ids, rope_deltas = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=attention_mask_tensor, + ) + self.rope_deltas = rope_deltas # then use the prev pre-calculated rope-deltas to get the correct position ids else: batch_size, seq_length, _ = inputs_embeds.shape From 9b76446a5ff22296b57dc4317c24e8c41f1a5723 Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Mon, 22 Sep 2025 18:31:36 +0800 Subject: [PATCH 10/44] Update model_patcher.py --- optimum/exporters/openvino/model_patcher.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1aee58b255..74f1540c05 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4363,8 +4363,7 @@ def lm_forward(self, attention_mask, position_ids, past_key_values, inputs_embed ) hidden_states = outputs[0] # Only compute necessary logits, and do not upcast them to float if we are not computing the loss - slice_indices = slice(-1, None) - logits = self.lm_head(hidden_states[:, slice_indices, :]) + logits = self.lm_head(hidden_states) return (logits, outputs.past_key_values.to_legacy_cache()) model.__orig_forward = model.forward From 8e7cdd259c2466ce2015c48922a3b63cd3fbc0fc Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Tue, 23 Sep 2025 15:02:13 +0800 Subject: [PATCH 11/44] Update modeling_visual_language.py --- optimum/intel/openvino/modeling_visual_language.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index fdc435fd66..f6f2072fa3 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -979,6 +979,13 @@ def preprocess_inputs( Preprocess input instruction and an image. """ + # modified from https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/generation/utils.py#L1992 + def _prepare_cache_for_generation(self, *args, **kwargs): + """ + This function is used to prepare the cache : when calling `generate` before the first inference, an instance of `DynamicCache` will be created. + For OVModel, we don't want model_kwargs to be updated before generation. + """ + return class _OVLlavaForCausalLM(OVModelForVisualCausalLM): def __init__( From 3cb4e20ea17aa8369531e2605f204284025d6588 Mon Sep 17 00:00:00 2001 From: ethan Date: Sun, 28 Sep 2025 21:59:01 -0700 Subject: [PATCH 12/44] Revert "Update modeling_visual_language.py" This reverts commit 8e7cdd259c2466ce2015c48922a3b63cd3fbc0fc. 
--- optimum/intel/openvino/modeling_visual_language.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index f6f2072fa3..fdc435fd66 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -979,13 +979,6 @@ def preprocess_inputs( Preprocess input instruction and an image. """ - # modified from https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/generation/utils.py#L1992 - def _prepare_cache_for_generation(self, *args, **kwargs): - """ - This function is used to prepare the cache : when calling `generate` before the first inference, an instance of `DynamicCache` will be created. - For OVModel, we don't want model_kwargs to be updated before generation. - """ - return class _OVLlavaForCausalLM(OVModelForVisualCausalLM): def __init__( From 741501ef4bb57f29e22ef0695f77f2ba67a6c16e Mon Sep 17 00:00:00 2001 From: Ethan Yang Date: Mon, 29 Sep 2025 16:17:10 +0800 Subject: [PATCH 13/44] Update modeling_visual_language.py --- optimum/intel/openvino/modeling_visual_language.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index fdc435fd66..f6f2072fa3 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -979,6 +979,13 @@ def preprocess_inputs( Preprocess input instruction and an image. """ + # modified from https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/generation/utils.py#L1992 + def _prepare_cache_for_generation(self, *args, **kwargs): + """ + This function is used to prepare the cache : when calling `generate` before the first inference, an instance of `DynamicCache` will be created. + For OVModel, we don't want model_kwargs to be updated before generation. 
+ """ + return class _OVLlavaForCausalLM(OVModelForVisualCausalLM): def __init__( From 02f9c501a551cb0cc37d5f5db5ac83bda8e4056d Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 26 Nov 2025 12:14:50 +0100 Subject: [PATCH 14/44] transformers 4.57 --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 2c2add705f..c4a294ee40 100644 --- a/setup.py +++ b/setup.py @@ -28,8 +28,8 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx==0.0.*", - "transformers>=4.45,<4.56", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-4.57", + "transformers>=4.45,<4.58", "setuptools", ] From c68919f1d34f613922039edbe9494245a798e86d Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Wed, 26 Nov 2025 12:40:31 +0100 Subject: [PATCH 15/44] patch dynamic cache layer --- optimum/exporters/openvino/model_patcher.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 7da3d8cb21..f09dcae3d1 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -206,6 +206,23 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask +def patched_dynamic_layer_update( + self, + key_states: torch.Tensor, + value_states: torch.Tensor, + cache_kwargs: Optional[dict[str, Any]] = None, +) -> tuple[torch.Tensor, torch.Tensor]: + if self.keys is None: + self.keys = key_states + self.values = value_states + self.device, self.dtype = key_states.device, key_states.dtype + self.is_initialized = True + else: + self.keys = torch.cat([self.keys, key_states], dim=-2) + self.values = torch.cat([self.values, value_states], dim=-2) + return self.keys, self.values + + class OVDecoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() @@ -228,6 +245,12 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) + if is_transformers_version(">=", "4.56.0"): + from transformers.cache_utils import DynamicLayer + + self.original_dynamic_layer_update = DynamicLayer.update + DynamicLayer.update = patched_dynamic_layer_update + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -239,6 +262,11 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) + if is_transformers_version(">=", "4.56.0"): + from transformers.cache_utils import DynamicLayer + + DynamicLayer.update = self.original_dynamic_layer_update + def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, sequence_length, hidden_dim = hidden_states.shape From 073fc46466750fb12a83f112a48f00f49b7159f9 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 27 Nov 2025 11:49:43 +0100 Subject: [PATCH 16/44] fix qwen and gpt_oss --- optimum/exporters/openvino/model_configs.py | 3 +++ optimum/exporters/openvino/model_patcher.py | 20 ++++++++++++++++++++ tests/openvino/test_decoder.py | 4 +++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index cba7c547f0..3da0cc06d0 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -95,6 +95,7 @@ 
GptJModelPatcher, GptNeoModelPatcher, GptNeoxModelPatcher, + GptOssModelPatcher, GraniteMoEModelPatcher, IBertModelPatcher, Idefics3ImageEmbeddingsModelPatcher, @@ -575,6 +576,7 @@ class GptOssOpenVINOConfig(LlamaOpenVINOConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, GemmaDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = GemmaDummyPastKeyValuesGenerator MIN_TRANSFORMERS_VERSION = "4.55.1" + _MODEL_PATCHER = GptOssModelPatcher @register_in_tasks_manager( @@ -656,6 +658,7 @@ def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int @register_in_tasks_manager("qwen", *["text-generation", "text-generation-with-past"]) class QwenOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DEFAULT_ONNX_OPSET = 14 + MAX_TRANSFORMERS_VERSION = "4.55.4" NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_layers="num_hidden_layers", num_attention_heads="num_attention_heads", hidden_size="hidden_size" ) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f09dcae3d1..b83460d55c 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -31,6 +31,7 @@ from optimum.exporters.onnx.model_patcher import ( UNSUPPORTED_OPS_PATCHING_SPEC, ModelPatcher, + gpt_oss_forward, override_arguments, sdpa_mask_without_vmap, ) @@ -7203,3 +7204,22 @@ def __exit__(self, exc_type, exc_value, traceback): else: continue conv_layer.slow_forward = conv_layer._orig_forward + + +class GptOssModelPatcher(OVDecoderModelPatcher): + def __enter__(self): + super().__enter__() + + if is_transformers_version(">=", "4.55.0"): + from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts + + self.original_gpt_oss_forward = GptOssExperts.forward + GptOssExperts.forward = gpt_oss_forward + + def __exit__(self, exc_type, exc_value, traceback): + super().__exit__(exc_type, exc_value, traceback) + + if is_transformers_version(">=", "4.55.0"): + from transformers.models.gpt_oss.modeling_gpt_oss import GptOssExperts + + GptOssExperts.forward = self.original_gpt_oss_forward diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index ee9811d9ab..cfd918ff4a 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -54,7 +54,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "mpt", "opt", "pegasus", - "qwen", "phi", "internlm2", "orion", @@ -133,6 +132,9 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): if is_transformers_version(">=", "4.55.0"): SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") + if is_transformers_version("<", "4.56.0"): + SUPPORTED_ARCHITECTURES += ("qwen",) + GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( "chatglm", From 5b245cfcc2097dd7d69b0b12f43572da9227306b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 27 Nov 2025 14:12:08 +0100 Subject: [PATCH 17/44] fix seq2seq models as well --- optimum/exporters/openvino/model_patcher.py | 18 +++++++++++++----- tests/openvino/utils_tests.py | 2 +- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index b83460d55c..4f41859f32 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -41,6 +41,8 @@ if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask from 
transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock +if is_transformers_version("<", "4.56"): + from transformers.cache_utils import DynamicLayer if TYPE_CHECKING: @@ -59,6 +61,7 @@ "tril", "norm", "unfold", + "movedim", "rms_norm", "repeat_interleave", "scaled_dot_product_attention", @@ -216,7 +219,8 @@ def patched_dynamic_layer_update( if self.keys is None: self.keys = key_states self.values = value_states - self.device, self.dtype = key_states.device, key_states.dtype + self.device = key_states.device + self.dtype = key_states.dtype self.is_initialized = True else: self.keys = torch.cat([self.keys, key_states], dim=-2) @@ -247,8 +251,6 @@ def __enter__(self): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) if is_transformers_version(">=", "4.56.0"): - from transformers.cache_utils import DynamicLayer - self.original_dynamic_layer_update = DynamicLayer.update DynamicLayer.update = patched_dynamic_layer_update @@ -264,9 +266,8 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) if is_transformers_version(">=", "4.56.0"): - from transformers.cache_utils import DynamicLayer - DynamicLayer.update = self.original_dynamic_layer_update + DynamicLayer.lazy_initialization = self.original_lazy_initialization def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -4454,6 +4455,10 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) + if is_transformers_version(">=", "4.56.0"): + self.original_dynamic_layer_update = DynamicLayer.update + DynamicLayer.update = patched_dynamic_layer_update + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -4461,6 +4466,9 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) + if is_transformers_version(">=", "4.56.0"): + DynamicLayer.update = self.original_dynamic_layer_update + class SanaTextEncoderModelPatcher(ModelPatcher): def __enter__(self): diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 7bbed8ac2e..5f28aa6bc4 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -43,7 +43,7 @@ "bge": "optimum-intel-internal-testing/bge-small-en-v1.5", "beit": "optimum-intel-internal-testing/tiny-random-BeitForImageClassification", "bert": "optimum-intel-internal-testing/tiny-random-bert", - "bart": "optimum-intel-internal-testing/tiny-random-bart", + "bart": "hf-internal-testing/tiny-random-BartModel", "baichuan2": "optimum-intel-internal-testing/tiny-random-baichuan2", "baichuan2-13b": "optimum-intel-internal-testing/tiny-random-baichuan2-13b", "bigbird_pegasus": "optimum-intel-internal-testing/tiny-random-bigbird_pegasus", From 513977a982a0b2c0f293f0707685b61429edabbe Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 27 Nov 2025 14:36:57 +0100 Subject: [PATCH 18/44] fix --- optimum/exporters/openvino/model_configs.py | 5 +---- optimum/exporters/openvino/model_patcher.py | 14 +++++++------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 3da0cc06d0..d67ac7ff86 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -4035,10 +4035,7 
@@ def inputs(self) -> Dict[str, Dict[int, str]]: if self.variant == "with-past" and self.use_past_in_inputs: self.add_past_key_values(common_inputs, direction="inputs") elif self._behavior is SpeechT5ConfigBehavior.POSTNET: - common_inputs["raw_spectrogram"] = { - 0: "n_spectrums", - 1: "batch_size", - } + common_inputs["raw_spectrogram"] = {0: "n_spectrums", 1: "batch_size"} elif self._behavior is SpeechT5ConfigBehavior.VOCODER: common_inputs["spectrogram"] = {0: "batch_size", 1: "n_spectrums"} else: diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4f41859f32..f2a9dcfe39 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -41,7 +41,7 @@ if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock -if is_transformers_version("<", "4.56"): +if is_transformers_version(">=", "4.56"): from transformers.cache_utils import DynamicLayer @@ -250,7 +250,7 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) - if is_transformers_version(">=", "4.56.0"): + if is_transformers_version(">=", "4.56"): self.original_dynamic_layer_update = DynamicLayer.update DynamicLayer.update = patched_dynamic_layer_update @@ -261,11 +261,11 @@ def __exit__(self, exc_type, exc_value, traceback): self._model._update_causal_mask = self._model._update_causal_mask_original del self._model._update_causal_mask_original - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.53"): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) - if is_transformers_version(">=", "4.56.0"): + if is_transformers_version(">=", "4.56"): DynamicLayer.update = self.original_dynamic_layer_update DynamicLayer.lazy_initialization = self.original_lazy_initialization @@ -4455,18 +4455,18 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) - if is_transformers_version(">=", "4.56.0"): + if is_transformers_version(">=", "4.56"): self.original_dynamic_layer_update = DynamicLayer.update DynamicLayer.update = patched_dynamic_layer_update def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.53.0"): + if is_transformers_version(">=", "4.53"): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) - if is_transformers_version(">=", "4.56.0"): + if is_transformers_version(">=", "4.56"): DynamicLayer.update = self.original_dynamic_layer_update From 43d58427d7a5f793645ad7060a63152785579450 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 27 Nov 2025 15:31:42 +0100 Subject: [PATCH 19/44] fix --- optimum/exporters/openvino/model_patcher.py | 1 - 1 file changed, 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f2a9dcfe39..0081a8e4eb 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -267,7 +267,6 @@ def __exit__(self, exc_type, exc_value, traceback): if is_transformers_version(">=", "4.56"): DynamicLayer.update = 
self.original_dynamic_layer_update - DynamicLayer.lazy_initialization = self.original_lazy_initialization def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: From 79a0bbfbee9d251185682e175e4888eba4b6c3d6 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Thu, 27 Nov 2025 16:41:37 +0100 Subject: [PATCH 20/44] more decoder fixes --- optimum/exporters/openvino/model_configs.py | 97 ++++++++------------- optimum/exporters/openvino/model_patcher.py | 56 ++++++------ tests/openvino/test_decoder.py | 4 +- tests/openvino/test_modeling.py | 2 +- 4 files changed, 67 insertions(+), 92 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index d67ac7ff86..430481dd8f 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -33,6 +33,7 @@ FalconOnnxConfig, GemmaOnnxConfig, GPT2OnnxConfig, + GPTBigCodeOnnxConfig, GPTJOnnxConfig, GPTNeoOnnxConfig, GPTNeoXOnnxConfig, @@ -41,6 +42,7 @@ MarianOnnxConfig, MistralOnnxConfig, MPTOnnxConfig, + OPTOnnxConfig, PegasusOnnxConfig, PhiOnnxConfig, SpeechT5OnnxConfig, @@ -437,6 +439,7 @@ class ChatGLM2OpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, ChatGLM2DummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = ChatGLM2DummyPastKeyValuesGenerator _MODEL_PATCHER = ChatGLMModelPatcher + MAX_TRANSFORMERS_VERSION = "4.55.4" def generate_dummy_inputs(self, framework: str = "pt", **kwargs): dummy_inputs_generators = self._create_dummy_input_generator_classes(**kwargs) @@ -762,6 +765,7 @@ class OrionOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): DUMMY_INPUT_GENERATOR_CLASSES = (DummyTextInputGenerator, MistralDummyPastKeyValuesGenerator) DUMMY_PKV_GENERATOR_CLASS = MistralDummyPastKeyValuesGenerator NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + _MODEL_PATCHER = OVDecoderModelPatcher @register_in_tasks_manager("olmo", *["text-generation", "text-generation-with-past"], library_name="transformers") @@ -897,9 +901,37 @@ class PersimmonOpenVINOConfig(TextDecoderWithPositionIdsOnnxConfig): class BioGPTOpenVINOConfig( TextDecoderWithPositionIdsOnnxConfig if is_transformers_version(">=", "4.52.0") else TextDecoderOnnxConfig ): - # BioGPT does not require position_ids input. 
- DEFAULT_ONNX_OPSET = 13 NORMALIZED_CONFIG_CLASS = NormalizedTextConfig + _MODEL_PATCHER = OVDecoderModelPatcher + + +@register_in_tasks_manager( + "gpt_bigcode", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], +) +class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): + _MODEL_PATCHER = OVDecoderModelPatcher + + +@register_in_tasks_manager( + "opt", + *[ + "feature-extraction", + "feature-extraction-with-past", + "text-generation", + "text-generation-with-past", + "text-classification", + ], + library_name="transformers", +) +class OPTOpenVINOConfig(OPTOnnxConfig): + _MODEL_PATCHER = OVDecoderModelPatcher @register_in_tasks_manager( @@ -969,6 +1001,7 @@ class XGLMConfig(TextDecoderWithPositionIdsOnnxConfig): NORMALIZED_CONFIG_CLASS = NormalizedTextConfig.with_args( num_attention_heads="attention_heads", hidden_size="d_model" ) + _MODEL_PATCHER = OVDecoderModelPatcher class AquilaDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): @@ -3517,66 +3550,6 @@ class GraniteMoEOpenVINOConfig(LlamaOpenVINOConfig): _MODEL_PATCHER = GraniteMoEModelPatcher -# TODO: remove and replace with GPTBigCodeDummyPastKeyValuesGenerator when optimum >= v2 -class GPTBigCodeDummyPastKeyValuesGenerator(DummyPastKeyValuesGenerator): - def __init__( - self, - task: str, - normalized_config: NormalizedTextConfig, - batch_size: int = DEFAULT_DUMMY_SHAPES["batch_size"], - sequence_length: int = DEFAULT_DUMMY_SHAPES["sequence_length"], - random_batch_size_range: Optional[Tuple[int, int]] = None, - random_sequence_length_range: Optional[Tuple[int, int]] = None, - **kwargs, - ): - super().__init__( - task=task, - normalized_config=normalized_config, - batch_size=batch_size, - sequence_length=sequence_length, - random_batch_size_range=random_batch_size_range, - random_sequence_length_range=random_sequence_length_range, - **kwargs, - ) - self.multi_query = normalized_config.multi_query - - def generate(self, input_name: str, framework: str = "pt", int_dtype: str = "int64", float_dtype: str = "fp32"): - if is_transformers_version("<", "4.54"): - if self.multi_query: - shape = ( - self.batch_size, - self.sequence_length, - self.hidden_size // self.num_attention_heads * 2, - ) - else: - shape = ( - self.batch_size, - self.num_attention_heads, - self.sequence_length, - self.hidden_size // self.num_attention_heads * 2, - ) - pkv = [ - self.random_float_tensor(shape, framework=framework, dtype=float_dtype) for _ in range(self.num_layers) - ] - - else: - shape = ( - self.batch_size, - self.num_attention_heads if not self.multi_query else 1, - self.sequence_length, - self.hidden_size // self.num_attention_heads, - ) - pkv = [ - ( - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - self.random_float_tensor(shape, framework=framework, dtype=float_dtype), - ) - for _ in range(self.num_layers) - ] - - return pkv - - @register_in_tasks_manager( "whisper", *[ diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 0081a8e4eb..f51c5ccefe 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -2324,18 +2324,20 @@ class PersimmonModelPatcher(OVDecoderModelPatcher): def __enter__(self): super().__enter__() - for layer in self._model.model.layers: - if is_torch_version(">=", "2.1.0"): - orig_self_attn_fwd = layer.self_attn.forward - layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, 
layer.self_attn) - layer.self_attn._orig_forward = orig_self_attn_fwd + if is_transformers_version("<", "4.56"): + for layer in self._model.model.layers: + if is_torch_version(">=", "2.1.0"): + orig_self_attn_fwd = layer.self_attn.forward + layer.self_attn.forward = types.MethodType(_persimmon_self_attn_sdpa_forward, layer.self_attn) + layer.self_attn._orig_forward = orig_self_attn_fwd def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - for layer in self._model.model.layers: - if hasattr(layer.self_attn, "_orig_forward"): - layer.self_attn.forward = layer.self_attn._orig_forward + if is_transformers_version("<", "4.56"): + for layer in self._model.model.layers: + if hasattr(layer.self_attn, "_orig_forward"): + layer.self_attn.forward = layer.self_attn._orig_forward def _jais_attn_forward( @@ -5097,47 +5099,49 @@ def modulewise_unpatch(model, module_cls): class BlenderbotModelPatcher(OVSeq2SeqModelPatcher): def __enter__(self): super().__enter__() - from transformers.models.blenderbot.modeling_blenderbot import BlenderbotAttention + if is_transformers_version("<", "4.56"): + from transformers.models.blenderbot.modeling_blenderbot import BlenderbotAttention - modulewise_patch(self._model, BlenderbotAttention, _blenderbot_attn_forward) + modulewise_patch(self._model, BlenderbotAttention, _blenderbot_attn_forward) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - from transformers.models.blenderbot.modeling_blenderbot import BlenderbotAttention + if is_transformers_version("<", "4.56"): + from transformers.models.blenderbot.modeling_blenderbot import BlenderbotAttention - modulewise_unpatch(self._model, BlenderbotAttention) + modulewise_unpatch(self._model, BlenderbotAttention) class BlenderbotSmallModelPatcher(OVSeq2SeqModelPatcher): def __enter__(self): super().__enter__() + if is_transformers_version("<", "4.56"): + from transformers.models.blenderbot_small.modeling_blenderbot_small import BlenderbotSmallAttention - from transformers.models.blenderbot_small.modeling_blenderbot_small import BlenderbotSmallAttention - - modulewise_patch(self._model, BlenderbotSmallAttention, _blenderbot_attn_forward) + modulewise_patch(self._model, BlenderbotSmallAttention, _blenderbot_attn_forward) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) + if is_transformers_version("<", "4.56"): + from transformers.models.blenderbot_small.modeling_blenderbot_small import BlenderbotSmallAttention - from transformers.models.blenderbot_small.modeling_blenderbot_small import BlenderbotSmallAttention - - modulewise_unpatch(self._model, BlenderbotSmallAttention) + modulewise_unpatch(self._model, BlenderbotSmallAttention) class PegasusModelPatcher(OVSeq2SeqModelPatcher): def __enter__(self): super().__enter__() + if is_transformers_version("<", "4.56"): + from transformers.models.pegasus.modeling_pegasus import PegasusAttention - from transformers.models.pegasus.modeling_pegasus import PegasusAttention - - modulewise_patch(self._model, PegasusAttention, _blenderbot_attn_forward) + modulewise_patch(self._model, PegasusAttention, _blenderbot_attn_forward) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) + if is_transformers_version("<", "4.56"): + from transformers.models.pegasus.modeling_pegasus import PegasusAttention - from transformers.models.pegasus.modeling_pegasus import PegasusAttention - - 
modulewise_unpatch(self._model, PegasusAttention) + modulewise_unpatch(self._model, PegasusAttention) # Copied from https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/qwen2_moe/modeling_qwen2_moe.py#L596 @@ -5206,14 +5210,14 @@ def __exit__(self, exc_type, exc_value, traceback): class MarianModelPatcher(OVSeq2SeqModelPatcher): def __enter__(self): super().__enter__() - if is_transformers_version(">=", "4.49.0"): + if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.56"): from transformers.models.marian.modeling_marian import MarianAttention modulewise_patch(self._model, MarianAttention, _blenderbot_attn_forward) def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) - if is_transformers_version(">=", "4.49.0"): + if is_transformers_version(">=", "4.49.0") and is_transformers_version("<", "4.56"): from transformers.models.marian.modeling_marian import MarianAttention modulewise_unpatch(self._model, MarianAttention) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index cfd918ff4a..105e67f16b 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -39,7 +39,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "blenderbot", "blenderbot-small", "bloom", - "chatglm", "codegen", "codegen2", "gpt2", @@ -68,7 +67,6 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): "xverse", "internlm", "jais", - "chatglm4", "decilm", "gemma", "olmo", @@ -133,7 +131,7 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("gpt_oss", "gpt_oss_mxfp4") if is_transformers_version("<", "4.56.0"): - SUPPORTED_ARCHITECTURES += ("qwen",) + SUPPORTED_ARCHITECTURES += ("qwen", "chatglm", "chatglm4") GENERATION_LENGTH = 100 REMOTE_CODE_MODELS = ( diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index ff944472bb..dbcd7a79b4 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -235,7 +235,7 @@ def test_load_from_hub_and_save_visual_language_model(self): if is_transformers_version(">=", "4.51"): model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + processor = AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" 
image = Image.open( requests.get( From bc57cecf7225cc4f647cb9d2b4d416c47fe01769 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 00:06:10 +0100 Subject: [PATCH 21/44] limit awq --- tests/openvino/test_decoder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 105e67f16b..5cc0515dcf 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -105,7 +105,11 @@ class OVModelForCausalLMIntegrationTest(unittest.TestCase): SUPPORTED_ARCHITECTURES += ("opt_gptq",) # autoawq install disabled for windows test environment - if is_openvino_version(">=", "2024.6.0") and platform.system() != "Windows": + if ( + platform.system() != "Windows" + and is_openvino_version(">=", "2024.6.0") + and is_transformers_version("<", "4.56.0") + ): SUPPORTED_ARCHITECTURES += ("mixtral_awq",) if is_transformers_version(">", "4.49"): From 6489d7e2f1e8b71801a651b05f9e0da89d89d000 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 00:09:13 +0100 Subject: [PATCH 22/44] fix dynamic layer in optimum-onnx's model patcher --- optimum/exporters/openvino/model_configs.py | 31 --------------------- optimum/exporters/openvino/model_patcher.py | 16 ----------- 2 files changed, 47 deletions(-) diff --git a/optimum/exporters/openvino/model_configs.py b/optimum/exporters/openvino/model_configs.py index 430481dd8f..64d5fc98cc 100644 --- a/optimum/exporters/openvino/model_configs.py +++ b/optimum/exporters/openvino/model_configs.py @@ -33,7 +33,6 @@ FalconOnnxConfig, GemmaOnnxConfig, GPT2OnnxConfig, - GPTBigCodeOnnxConfig, GPTJOnnxConfig, GPTNeoOnnxConfig, GPTNeoXOnnxConfig, @@ -42,7 +41,6 @@ MarianOnnxConfig, MistralOnnxConfig, MPTOnnxConfig, - OPTOnnxConfig, PegasusOnnxConfig, PhiOnnxConfig, SpeechT5OnnxConfig, @@ -905,35 +903,6 @@ class BioGPTOpenVINOConfig( _MODEL_PATCHER = OVDecoderModelPatcher -@register_in_tasks_manager( - "gpt_bigcode", - *[ - "feature-extraction", - "feature-extraction-with-past", - "text-generation", - "text-generation-with-past", - "text-classification", - ], -) -class GPTBigCodeOpenVINOConfig(GPTBigCodeOnnxConfig): - _MODEL_PATCHER = OVDecoderModelPatcher - - -@register_in_tasks_manager( - "opt", - *[ - "feature-extraction", - "feature-extraction-with-past", - "text-generation", - "text-generation-with-past", - "text-classification", - ], - library_name="transformers", -) -class OPTOpenVINOConfig(OPTOnnxConfig): - _MODEL_PATCHER = OVDecoderModelPatcher - - @register_in_tasks_manager( "gpt_neo", *[ diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index f51c5ccefe..c36e98592b 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -41,8 +41,6 @@ if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock -if is_transformers_version(">=", "4.56"): - from transformers.cache_utils import DynamicLayer if TYPE_CHECKING: @@ -250,10 +248,6 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) - if is_transformers_version(">=", "4.56"): - self.original_dynamic_layer_update = DynamicLayer.update - DynamicLayer.update = patched_dynamic_layer_update - def __exit__(self, exc_type, exc_value, traceback): 
super().__exit__(exc_type, exc_value, traceback) @@ -265,9 +259,6 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) - if is_transformers_version(">=", "4.56"): - DynamicLayer.update = self.original_dynamic_layer_update - def _mixtral_sparse_moe_block_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: batch_size, sequence_length, hidden_dim = hidden_states.shape @@ -4456,10 +4447,6 @@ def __enter__(self): # non-stateful models on cpu and stateful models on npu ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", eager_mask_without_vmap) - if is_transformers_version(">=", "4.56"): - self.original_dynamic_layer_update = DynamicLayer.update - DynamicLayer.update = patched_dynamic_layer_update - def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) @@ -4467,9 +4454,6 @@ def __exit__(self, exc_type, exc_value, traceback): ALL_MASK_ATTENTION_FUNCTIONS.register("sdpa", sdpa_mask) ALL_MASK_ATTENTION_FUNCTIONS.register("eager", eager_mask) - if is_transformers_version(">=", "4.56"): - DynamicLayer.update = self.original_dynamic_layer_update - class SanaTextEncoderModelPatcher(ModelPatcher): def __enter__(self): From 11b5a5a9b23200a53b445c5f4af7a71aa3422107 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 00:10:47 +0100 Subject: [PATCH 23/44] remove --- optimum/exporters/openvino/model_patcher.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index c36e98592b..db50c4dd27 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -208,24 +208,6 @@ def eager_mask_without_vmap(*args, **kwargs) -> Optional[torch.Tensor]: return mask -def patched_dynamic_layer_update( - self, - key_states: torch.Tensor, - value_states: torch.Tensor, - cache_kwargs: Optional[dict[str, Any]] = None, -) -> tuple[torch.Tensor, torch.Tensor]: - if self.keys is None: - self.keys = key_states - self.values = value_states - self.device = key_states.device - self.dtype = key_states.dtype - self.is_initialized = True - else: - self.keys = torch.cat([self.keys, key_states], dim=-2) - self.values = torch.cat([self.values, value_states], dim=-2) - return self.keys, self.values - - class OVDecoderModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() From d6cd7a60ba9876e9a9718f12566d8aed9db5fc9b Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 10:42:35 +0100 Subject: [PATCH 24/44] fix donut --- tests/openvino/utils_tests.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/utils_tests.py b/tests/openvino/utils_tests.py index 5f28aa6bc4..15846c3cc2 100644 --- a/tests/openvino/utils_tests.py +++ b/tests/openvino/utils_tests.py @@ -73,7 +73,7 @@ "convnextv2": "optimum-intel-internal-testing/tiny-random-ConvNextV2Model", "distilbert": "optimum-intel-internal-testing/tiny-random-distilbert", "distilbert-ov": "optimum-intel-internal-testing/ov-tiny-random-distilbert", - "donut": "optimum-intel-internal-testing/tiny-doc-qa-vision-encoder-decoder", + "donut": "optimum-internal-testing/tiny-random-VisionEncoderDecoderModel-donut", "donut-swin": "optimum-intel-internal-testing/tiny-random-DonutSwinModel", "detr": "optimum-intel-internal-testing/tiny-random-DetrModel", "electra": "optimum-intel-internal-testing/tiny-random-electra", 
From 272a624dbeea05bee98409c759092b68e6b3e18a Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 11:57:26 +0100 Subject: [PATCH 25/44] vlm fixes --- optimum/exporters/openvino/model_patcher.py | 77 ++++++++----------- .../openvino/modeling_visual_language.py | 2 + 2 files changed, 33 insertions(+), 46 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index db50c4dd27..4abe9b89da 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -3452,7 +3452,7 @@ def _minicpmv_siglip_transformer_forward( patch_attention_mask = patch_attention_mask.view(batch_size, -1) attention_mask = ( _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) - if not self._use_flash_attention_2 + if not getattr(self, "_use_flash_attention_2", False) else patch_attention_mask ) @@ -4352,15 +4352,6 @@ def __init__( ): super().__init__(config, model, model_kwargs) - # sometimes the use_cache is not properly set in the model config - if self.real_config.use_past: - if hasattr(model.config, "decoder"): - model.config.decoder.use_cache = True - if hasattr(model.config, "text_config"): - model.config.text_config.use_cache = True - if model.config.model_type == "vision-encoder-decoder" and model.config.decoder.model_type == "trocr": - model.decoder.model.decoder.config.use_cache = True - # re-use the patched forward method from the parent class self.super_patched_forward = self.patched_forward @@ -4370,49 +4361,38 @@ def patched_forward(*args, **kwargs): args, kwargs = override_arguments(args, kwargs, signature, model_kwargs=self.model_kwargs) # with statful decoder, we always return the self attn only, cross attn is part of the state - pkv = None if ( getattr(self.real_config, "stateful", False) and self.real_config._behavior == "decoder" and "past_key_values" in signature.parameters ): - pkv_argument_index = list(signature.parameters.keys()).index("past_key_values") + pkv = None + pkv_arg_index = list(signature.parameters.keys()).index("past_key_values") if "past_key_values" in kwargs: pkv = kwargs["past_key_values"] - elif len(args) > pkv_argument_index: - pkv = args[pkv_argument_index] - - if isinstance(pkv, EncoderDecoderCache): - pkv = pkv.to_legacy_cache() + elif len(args) > pkv_arg_index: + pkv = args[pkv_arg_index] if pkv is not None: - self_attn = [cache_item[:2] for cache_item in pkv] - pkv = EncoderDecoderCache.from_legacy_cache(self_attn) + if isinstance(pkv, EncoderDecoderCache): + pkv = pkv.self_attention_cache.to_legacy_cache() + else: + pkv = [pkv_item[:2] for pkv_item in pkv] + pkv = EncoderDecoderCache.from_legacy_cache(pkv) if "past_key_values" in kwargs: kwargs["past_key_values"] = pkv - elif len(args) > pkv_argument_index: - args[pkv_argument_index] = pkv + elif len(args) > pkv_arg_index: + args[pkv_arg_index] = pkv outputs = self.super_patched_forward(*args, **kwargs) # the optimum-onnx seq2seq model patcher only converts to tuple starting from 4.48 - if isinstance(outputs.get("past_key_values"), EncoderDecoderCache): + if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() - # we still need to filter out cross attention in the case of non-stateful decoder - filtered_outputs = {} - for name, value in outputs.items(): - if ( - self.real_config._behavior == "decoder" - and self.real_config.use_past_in_inputs - and name.startswith("past_key_values") - 
): - filtered_outputs[name] = tuple([v[:2] for v in value]) - else: - filtered_outputs[name] = value - return filtered_outputs + return outputs self.patched_forward = patched_forward @@ -4707,15 +4687,12 @@ def transformer_forward( # avoiding passing the attention_mask, which is equivalent to attending to the full sequence if not torch.any(~patch_attention_mask): patch_attention_mask = None - elif not self._use_flash_attention_2: + elif not getattr(self, "_use_flash_attention_2", False): patch_attention_mask = _prepare_4d_attention_mask(patch_attention_mask, hidden_states.dtype) encoder_outputs = self.encoder( inputs_embeds=hidden_states, attention_mask=patch_attention_mask, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] @@ -5985,11 +5962,17 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.0/src/transformers/models/llama4/modeling_llama4.py#L1732-L1741 def get_image_embeddings(self, pixel_values): - image_features = self.get_image_features( - pixel_values=pixel_values, - vision_feature_layer=self.config.vision_config.vision_feature_layer, - vision_feature_select_strategy=self.config.vision_config.vision_feature_select_strategy, - ) + if is_transformers_version("<", "4.56"): + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_layer=self.config.vision_config.vision_feature_layer, + vision_feature_select_strategy=self.config.vision_config.vision_feature_select_strategy, + ) + else: + image_features = self.get_image_features( + pixel_values=pixel_values, + vision_feature_select_strategy=self.config.vision_config.vision_feature_select_strategy, + ) vision_flat = image_features.view(-1, image_features.size(-1)) projected_vision_flat = self.multi_modal_projector(vision_flat) return projected_vision_flat @@ -6054,12 +6037,14 @@ def llama4_attn_forward( hidden_states: torch.Tensor, position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor], - past_key_value=None, + past_key_value: Optional[tuple[tuple[torch.Tensor]]] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: from transformers.models.llama4.modeling_llama4 import ALL_ATTENTION_FUNCTIONS, eager_attention_forward + past_key_value = past_key_value or kwargs.get("past_key_values") + input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -6080,7 +6065,7 @@ def llama4_attn_forward( # Use temperature tuning from https://arxiv.org/abs/2501.19399) to NoROPE layers if self.attn_temperature_tuning and not self.use_rope: attn_scales = ( - torch.log(torch.floor((cache_position.float() + 1.0) / self.floor_scale) + 1.0) * self.attn_scale + 1.0 + torch.log1p(torch.floor((cache_position.float() + 1.0) / self.floor_scale)) * self.attn_scale + 1.0 ) attn_scales = attn_scales.view((1, input_shape[-1], 1, 1)).expand((*input_shape, 1, 1)) # batch size > 1 query_states = (query_states * attn_scales).to(query_states.dtype) @@ -6144,7 +6129,7 @@ def llama4_moe_forward(self, hidden_states): index=router_indices, ).to(hidden_states.device) # we gather inputs corresponding to each expert based on the router indices - routed_in = routed_in * router_scores.reshape(-1, 1) + routed_in = routed_in * router_scores.transpose(0, 1).reshape(-1, 1) routed_out = self.experts(routed_in) out = self.shared_expert(hidden_states) # now 
that we finished expert computation -> we scatter add because we gathered previously diff --git a/optimum/intel/openvino/modeling_visual_language.py b/optimum/intel/openvino/modeling_visual_language.py index aa2de36c9c..56f13bd821 100644 --- a/optimum/intel/openvino/modeling_visual_language.py +++ b/optimum/intel/openvino/modeling_visual_language.py @@ -4434,7 +4434,9 @@ def preprocess_inputs( "phi3_v": _OVPhi3VisionForCausalLM, "internvl_chat": _OVInternVLForCausalLM, "qwen2_vl": _OVQwen2VLForCausalLM, + "qwen2_vl_text": _OVQwen2VLForCausalLM, "qwen2_5_vl": _OVQwen2_5_VLForCausalLM, + "qwen2_5_vl_text": _OVQwen2_5_VLForCausalLM, "got_ocr2": _OVGotOCR2ForCausalLM, "gemma3": _OVGemma3ForCausalLM, "idefics3": _OVIdefics3ForCausalLM, From c62546e143240fd72efba2a1f9b0667461126eba Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 12:18:50 +0100 Subject: [PATCH 26/44] fix speecht5 --- optimum/exporters/openvino/model_patcher.py | 22 +++++++-------------- 1 file changed, 7 insertions(+), 15 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 4abe9b89da..6f740a981a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5400,7 +5400,7 @@ def speecht5_decoder_layer_forward( class OVSpeechT5ModelPatcher(ModelPatcher): def __enter__(self): if self.real_config._behavior != "vocoder": - setattr(self._model, self.orig_forward_name, self.patched_forward) + super().__enter__() if self.real_config._behavior == "decoder": self._model.speecht5.decoder.prenet.__orig_forward = self._model.speecht5.decoder.prenet.forward self._model.speecht5.decoder.prenet.forward = types.MethodType( @@ -5415,7 +5415,7 @@ def __enter__(self): def __exit__(self, exc_type, exc_value, traceback): if self.real_config._behavior != "vocoder": - setattr(self._model, self.orig_forward_name, self.orig_forward) + super().__exit__(exc_type, exc_value, traceback) if self.real_config._behavior == "decoder": self._model.speecht5.decoder.prenet.forward = types.MethodType( self._model.speecht5.decoder.prenet.__orig_forward, self._model.speecht5.decoder.prenet @@ -5464,12 +5464,9 @@ def patched_decoder_forward( encoder_attention_mask=None, past_key_values=None, ): - return_legacy_cache = False - if past_key_values is not None: - only_self_cache = [cache_item[:2] for cache_item in past_key_values] - past_key_values = only_self_cache - return_legacy_cache = True + past_key_values = [cache_item[:2] for cache_item in past_key_values] + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5480,7 +5477,6 @@ def patched_decoder_forward( # Run the decoder layers on the last element of the prenet output. decoder_out = model.speecht5.decoder.wrapped_decoder( hidden_states=decoder_hidden_states[:, -1:], - attention_mask=None, encoder_hidden_states=encoder_hidden_states[0], encoder_attention_mask=encoder_attention_mask, past_key_values=past_key_values, @@ -5488,10 +5484,6 @@ def patched_decoder_forward( output_attentions=output_cross_attentions, return_dict=True, ) - - # if output_cross_attentions: - # cross_attentions.append(torch.cat(decoder_out.cross_attentions, dim=0)) - last_decoder_output = decoder_out.last_hidden_state.squeeze(1) # Predict the new mel spectrum for this step in the sequence. @@ -5504,9 +5496,9 @@ def patched_decoder_forward( # Predict the probability that this is the stop token. 
prob = torch.sigmoid(model.speech_decoder_postnet.prob_out(last_decoder_output)) - if return_legacy_cache: - only_self_cache = [cache_item[:2] for cache_item in decoder_out.past_key_values] - past_key_values = only_self_cache + past_key_values = decoder_out.past_key_values + if past_key_values is not None: + past_key_values = past_key_values.self_attention_cache.to_legacy_cache() result = { "output_sequence_out": output_sequence_out, From a7ede39624634696e043f2aba1f76f179a7cc7ec Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 12:22:10 +0100 Subject: [PATCH 27/44] fix whisper --- optimum/exporters/openvino/model_patcher.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 6f740a981a..dfba9fbf18 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -4392,7 +4392,18 @@ def patched_forward(*args, **kwargs): if isinstance(outputs.get("past_key_values"), (DynamicCache, EncoderDecoderCache)): outputs["past_key_values"] = outputs["past_key_values"].to_legacy_cache() - return outputs + # we still need to filter out cross attention in the case of non-stateful decoder + filtered_outputs = {} + for name, value in outputs.items(): + if ( + self.real_config._behavior == "decoder" + and self.real_config.use_past_in_inputs + and name.startswith("past_key_values") + ): + filtered_outputs[name] = tuple([v[:2] for v in value]) + else: + filtered_outputs[name] = value + return filtered_outputs self.patched_forward = patched_forward From 817bc54075b014ad63480521cc2acfe000c6a700 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 12:49:24 +0100 Subject: [PATCH 28/44] fix --- optimum/exporters/openvino/model_patcher.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index dfba9fbf18..1e46e2747a 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -5477,7 +5477,8 @@ def patched_decoder_forward( ): if past_key_values is not None: past_key_values = [cache_item[:2] for cache_item in past_key_values] - past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) + if is_transformers_version(">=", "4.56"): + past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values) output_sequence = inputs_embeds output_cross_attentions = False @@ -5509,7 +5510,10 @@ def patched_decoder_forward( past_key_values = decoder_out.past_key_values if past_key_values is not None: - past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + if isinstance(past_key_values, EncoderDecoderCache): + past_key_values = past_key_values.self_attention_cache.to_legacy_cache() + else: + past_key_values = [cache_item[:2] for cache_item in past_key_values] result = { "output_sequence_out": output_sequence_out, From 225b81d02dd5aa4e55a07de0ad745022cdb86bc1 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 15:20:39 +0100 Subject: [PATCH 29/44] fix qwenvl --- optimum/exporters/openvino/__main__.py | 5 +++++ optimum/exporters/openvino/utils.py | 24 ++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index aa63a0794b..4519447042 100644 --- a/optimum/exporters/openvino/__main__.py +++ 
b/optimum/exporters/openvino/__main__.py @@ -45,6 +45,7 @@ clear_class_registry, deduce_diffusers_dtype, load_preprocessors, + patch_qwenvl_configs, ) @@ -280,6 +281,10 @@ def main_export( do_bitnet_patching = quant_method == "bitnet" model_type = config.model_type + if model_type.startswith("qwen2") and model_type.endswith("vl_text"): + patch_qwenvl_configs() + model_type = config.model_type + if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True if custom_export_configs is None: diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 126eb28284..9b3b9f5796 100644 --- a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -405,3 +405,27 @@ def load_preprocessors( except Exception: pass return preprocessors + + +def patch_qwenvl_configs(): + from transformers import Qwen2_5_VLConfig, Qwen2VLConfig + + original_getattribute = Qwen2VLConfig.__getattribute__ + + def model_type_preserving_getattribute(self, name): + if name == "model_type": + return "qwen2_vl" + else: + return original_getattribute(self, name) + + Qwen2VLConfig.__getattribute__ = model_type_preserving_getattribute + + original_getattribute = Qwen2_5_VLConfig.__getattribute__ + + def model_type_preserving_getattribute(self, name): + if name == "model_type": + return "qwen2_5_vl" + else: + return original_getattribute(self, name) + + Qwen2_5_VLConfig.__getattribute__ = model_type_preserving_getattribute From 7c5c92c4c15ad75ae4da8765268cec5b693971fe Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 15:23:13 +0100 Subject: [PATCH 30/44] better fix --- optimum/exporters/openvino/__main__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 4519447042..179d78589b 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -281,7 +281,8 @@ def main_export( do_bitnet_patching = quant_method == "bitnet" model_type = config.model_type - if model_type.startswith("qwen2") and model_type.endswith("vl_text"): + + if is_transformers_version(">=", "4.56") and model_type in {"qwen2_vl", "qwen2_5_vl"}: patch_qwenvl_configs() model_type = config.model_type From 911626235f57b5d918de5a4eef4e966e7fa5a892 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Fri, 28 Nov 2025 15:29:33 +0100 Subject: [PATCH 31/44] fix recursion issue --- optimum/exporters/openvino/__main__.py | 6 ++---- optimum/exporters/openvino/utils.py | 16 ++++++++-------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/optimum/exporters/openvino/__main__.py b/optimum/exporters/openvino/__main__.py index 179d78589b..7bfaafb5ea 100644 --- a/optimum/exporters/openvino/__main__.py +++ b/optimum/exporters/openvino/__main__.py @@ -280,12 +280,10 @@ def main_export( do_gptq_patching = quant_method == "gptq" do_bitnet_patching = quant_method == "bitnet" - model_type = config.model_type - - if is_transformers_version(">=", "4.56") and model_type in {"qwen2_vl", "qwen2_5_vl"}: + if is_transformers_version(">=", "4.56") and config.model_type in {"qwen2_vl_text", "qwen2_5_vl_text"}: patch_qwenvl_configs() - model_type = config.model_type + model_type = config.model_type if model_type not in TasksManager._SUPPORTED_MODEL_TYPE: custom_architecture = True if custom_export_configs is None: diff --git a/optimum/exporters/openvino/utils.py b/optimum/exporters/openvino/utils.py index 9b3b9f5796..4e5ecf17fe 100644 --- 
a/optimum/exporters/openvino/utils.py +++ b/optimum/exporters/openvino/utils.py @@ -410,22 +410,22 @@ def load_preprocessors( def patch_qwenvl_configs(): from transformers import Qwen2_5_VLConfig, Qwen2VLConfig - original_getattribute = Qwen2VLConfig.__getattribute__ + original_getattribute_2 = Qwen2VLConfig.__getattribute__ - def model_type_preserving_getattribute(self, name): + def model_type_preserving_getattribute_2(self, name): if name == "model_type": return "qwen2_vl" else: - return original_getattribute(self, name) + return original_getattribute_2(self, name) - Qwen2VLConfig.__getattribute__ = model_type_preserving_getattribute + Qwen2VLConfig.__getattribute__ = model_type_preserving_getattribute_2 - original_getattribute = Qwen2_5_VLConfig.__getattribute__ + original_getattribute_25 = Qwen2_5_VLConfig.__getattribute__ - def model_type_preserving_getattribute(self, name): + def model_type_preserving_getattribute_25(self, name): if name == "model_type": return "qwen2_5_vl" else: - return original_getattribute(self, name) + return original_getattribute_25(self, name) - Qwen2_5_VLConfig.__getattribute__ = model_type_preserving_getattribute + Qwen2_5_VLConfig.__getattribute__ = model_type_preserving_getattribute_25 From f4591a72e73e8cf0a435599f873d15eafc488d9f Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 08:11:18 +0100 Subject: [PATCH 32/44] fix llama4 and quantization --- optimum/exporters/openvino/model_patcher.py | 24 +++++++++++++++------ tests/openvino/test_exporters_cli.py | 5 ++++- tests/openvino/test_quantization.py | 13 ++++++++--- 3 files changed, 32 insertions(+), 10 deletions(-) diff --git a/optimum/exporters/openvino/model_patcher.py b/optimum/exporters/openvino/model_patcher.py index 1e46e2747a..84a17f8ce0 100644 --- a/optimum/exporters/openvino/model_patcher.py +++ b/optimum/exporters/openvino/model_patcher.py @@ -41,6 +41,8 @@ if is_transformers_version(">=", "4.53"): from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, eager_mask, sdpa_mask from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock +if is_transformers_version(">=", "4.56"): + import transformers.masking_utils if TYPE_CHECKING: @@ -5969,7 +5971,7 @@ def __init__( # Adopted from https://github.com/huggingface/transformers/blob/v4.51.0/src/transformers/models/llama4/modeling_llama4.py#L1732-L1741 def get_image_embeddings(self, pixel_values): - if is_transformers_version("<", "4.56"): + if is_transformers_version("<", "4.57"): image_features = self.get_image_features( pixel_values=pixel_values, vision_feature_layer=self.config.vision_config.vision_feature_layer, @@ -6045,12 +6047,13 @@ def llama4_attn_forward( position_embeddings: Tuple[torch.Tensor, torch.Tensor], attention_mask: Optional[torch.Tensor], past_key_value: Optional[tuple[tuple[torch.Tensor]]] = None, + past_key_values: Optional[tuple[tuple[torch.Tensor]]] = None, cache_position: Optional[torch.LongTensor] = None, **kwargs, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: from transformers.models.llama4.modeling_llama4 import ALL_ATTENTION_FUNCTIONS, eager_attention_forward - past_key_value = past_key_value or kwargs.get("past_key_values") + past_key_value = past_key_value or past_key_values input_shape = hidden_states.shape[:-1] hidden_shape = (*input_shape, -1, self.head_dim) @@ -6087,10 +6090,7 @@ def llama4_attn_forward( attention_interface = eager_attention_forward if self.config._attn_implementation != "eager": - if 
self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False): - attention_interface = eager_attention_forward - else: - attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] attn_output, attn_weights = attention_interface( self, @@ -6149,6 +6149,7 @@ def llama4_moe_forward(self, hidden_states): class Llama4TextModelPatcher(ModelPatcher): def __enter__(self): super().__enter__() + self._model.model.rotary_emb._orig_forward = self._model.model.rotary_emb.forward self._model.model.rotary_emb.forward = types.MethodType(llama4_rope_forward, self._model.model.rotary_emb) for layer in self._model.model.layers[: self._model.model.config.num_hidden_layers]: @@ -6158,14 +6159,25 @@ def __enter__(self): layer.self_attn._orig_forward = layer.self_attn.forward layer.self_attn.forward = types.MethodType(llama4_attn_forward, layer.self_attn) + if is_transformers_version(">=", "4.56"): + # openvino is not able to trace through the new chunked_overlay with left_padding + self.original_chunked_overlay = transformers.masking_utils.chunked_overlay + transformers.masking_utils.chunked_overlay = ( + lambda chunk_size, left_padding: transformers.masking_utils._legacy_chunked_overlay(chunk_size) + ) + def __exit__(self, exc_type, exc_value, traceback): super().__exit__(exc_type, exc_value, traceback) + self._model.model.rotary_emb.forward = self._model.model.rotary_emb._orig_forward for layer in self._model.model.layers[: self._model.model.config.num_hidden_layers]: if layer.is_moe_layer and is_transformers_version("<", "4.54"): layer.feed_forward.forward = layer.feed_forward._orig_forward layer.self_attn.forward = layer.self_attn._orig_forward + if is_transformers_version(">=", "4.56"): + transformers.masking_utils.chunked_overlay = self.original_chunked_overlay + # Vectorized implementation of ConvSequenceTransform to avoid if-else branching class ConvSequenceTransform(torch.nn.Module): diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 68712a8d07..4b867295fd 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -729,7 +729,10 @@ class OVCLIExportTestCase(unittest.TestCase): "int4 --group-size 16 --ratio 0.8 --dataset contextual --num-samples 1 " '--sensitivity-metric "mean_activation_magnitude"', { - "lm_model": {"int8": 46, "int4": 56}, + "lm_model": { + "int8": 46 if is_transformers_version("<", "4.57") else 48, + "int4": 56 if is_transformers_version("<", "4.57") else 54, + }, "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 16}, }, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 17f1ee2122..303f68fa48 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -590,9 +590,13 @@ class OVWeightCompressionTest(unittest.TestCase): bits=4, sym=False, group_size=32, - ignored_scope={"names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"]}, + ignored_scope={ + "names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"] + if is_transformers_version("<", "4.57") + else [] + }, ), - {"model": {"int8": 4, "int4": 38}}, + {"model": {"int8": 4, "int4": 38 if is_transformers_version("<", "4.57") else 40}}, ), ( OVModelForCausalLM, @@ -906,7 +910,10 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, ), { - "lm_model": {"int8": 46, "int4": 56}, + "lm_model": { 
+ "int8": 46 if is_transformers_version("<", "4.57") else 48, + "int4": 56 if is_transformers_version("<", "4.57") else 54, + }, "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 16}, }, From a5029bd9f5a4ecdd5e6944f54a8ba811c6137e96 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 08:24:11 +0100 Subject: [PATCH 33/44] fix setup --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index c4a294ee40..10fc9c5c22 100644 --- a/setup.py +++ b/setup.py @@ -28,7 +28,7 @@ INSTALL_REQUIRE = [ "torch>=2.1", - "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@transformers-4.57", + "optimum-onnx@git+https://github.com/huggingface/optimum-onnx.git@main", "transformers>=4.45,<4.58", "setuptools", ] From d1449c61a44b4b85c1062c6e1f0f2a3107ba09da Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 08:40:40 +0100 Subject: [PATCH 34/44] fix gemma3 and skip grouped beam search --- tests/openvino/test_decoder.py | 4 +++- tests/openvino/test_seq2seq.py | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 5cc0515dcf..2ad988b9b7 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -654,9 +654,11 @@ def test_beam_search(self, model_arch): gen_configs = [ beam_search_gen_config, beam_sample_gen_config, - group_beam_search_gen_config, constrained_beam_search_gen_config, ] + if is_transformers_version("<", "4.57.0"): + gen_configs.append(group_beam_search_gen_config) + set_seed(SEED) ov_model_stateful = OVModelForCausalLM.from_pretrained( model_id, export=True, use_cache=True, stateful=True, device=OPENVINO_DEVICE, **model_kwargs diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index beb17b1389..25cde72357 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -598,6 +598,9 @@ def test_compare_to_transformers(self, model_arch): f"but found counts: {bos_token_counts.tolist()}", ) + if is_transformers_version(">=", "4.57.0"): + inputs.pop("token_type_ids") + transformers_inputs = copy.deepcopy(inputs) # llama4 preprocessing force bf16 dtype for pixel_values, that does not work on CPU with fp32 model # if past key values are not initialized, llama4 creates HybridCache with bf16 precision @@ -660,7 +663,7 @@ def test_compare_to_transformers(self, model_arch): transformers_model.generation_config.cache_implementation = None from transformers.cache_utils import DynamicCache - additional_inputs = {"past_key_values": DynamicCache()} + additional_inputs = {"past_key_values": DynamicCache(config=transformers_model.config)} if model_arch == "llama4": transformers_inputs["past_key_values"] = DynamicCache() From a67941134694f39f142c6b98bfd8868d9bb50ff3 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 08:44:05 +0100 Subject: [PATCH 35/44] fix --- tests/openvino/test_seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_seq2seq.py b/tests/openvino/test_seq2seq.py index 25cde72357..3a074d30e9 100644 --- a/tests/openvino/test_seq2seq.py +++ b/tests/openvino/test_seq2seq.py @@ -663,7 +663,7 @@ def test_compare_to_transformers(self, model_arch): transformers_model.generation_config.cache_implementation = None from transformers.cache_utils import DynamicCache - additional_inputs = {"past_key_values": DynamicCache(config=transformers_model.config)} + additional_inputs = 
{"past_key_values": DynamicCache()} if model_arch == "llama4": transformers_inputs["past_key_values"] = DynamicCache() From 25d2f66762ff388bd97a3c5994a9f973e62e8569 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 09:17:09 +0100 Subject: [PATCH 36/44] fix quants --- tests/openvino/test_exporters_cli.py | 5 +---- tests/openvino/test_quantization.py | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py index 4b867295fd..68712a8d07 100644 --- a/tests/openvino/test_exporters_cli.py +++ b/tests/openvino/test_exporters_cli.py @@ -729,10 +729,7 @@ class OVCLIExportTestCase(unittest.TestCase): "int4 --group-size 16 --ratio 0.8 --dataset contextual --num-samples 1 " '--sensitivity-metric "mean_activation_magnitude"', { - "lm_model": { - "int8": 46 if is_transformers_version("<", "4.57") else 48, - "int4": 56 if is_transformers_version("<", "4.57") else 54, - }, + "lm_model": {"int8": 46, "int4": 56}, "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 16}, }, diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index 303f68fa48..f9ba2ce35e 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -910,10 +910,7 @@ class OVWeightCompressionTest(unittest.TestCase): num_samples=1, ), { - "lm_model": { - "int8": 46 if is_transformers_version("<", "4.57") else 48, - "int4": 56 if is_transformers_version("<", "4.57") else 54, - }, + "lm_model": {"int8": 46, "int4": 56}, "text_embeddings_model": {"int8": 1}, "vision_embeddings_model": {"int8": 16}, }, From bfcf961ddb9ab52fcb33eacbdb0e66de707adc62 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 10:43:39 +0100 Subject: [PATCH 37/44] fix --- tests/openvino/test_decoder.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/openvino/test_decoder.py b/tests/openvino/test_decoder.py index 2ad988b9b7..b6c77ee10d 100644 --- a/tests/openvino/test_decoder.py +++ b/tests/openvino/test_decoder.py @@ -654,10 +654,12 @@ def test_beam_search(self, model_arch): gen_configs = [ beam_search_gen_config, beam_sample_gen_config, - constrained_beam_search_gen_config, + # group_beam_search_gen_config, + # constrained_beam_search_gen_config, ] if is_transformers_version("<", "4.57.0"): - gen_configs.append(group_beam_search_gen_config) + # currently broken in transformers == 4.57.* + gen_configs.extend([group_beam_search_gen_config, constrained_beam_search_gen_config]) set_seed(SEED) ov_model_stateful = OVModelForCausalLM.from_pretrained( From b714f6d7eed2c05e0551e2941638681d1912c998 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 12:50:49 +0100 Subject: [PATCH 38/44] fix --- tests/openvino/test_quantization.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py index f9ba2ce35e..6b6e19d8b6 100644 --- a/tests/openvino/test_quantization.py +++ b/tests/openvino/test_quantization.py @@ -591,12 +591,14 @@ class OVWeightCompressionTest(unittest.TestCase): sym=False, group_size=32, ignored_scope={ - "names": ["__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul"] - if is_transformers_version("<", "4.57") - else [] + "names": [ + "__module.model.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + if is_transformers_version("<", "4.57") + else "__module.transformer.h.2.mlp.c_fc/aten::addmm/MatMul" + ] }, ), - 
{"model": {"int8": 4, "int4": 38 if is_transformers_version("<", "4.57") else 40}}, + {"model": {"int8": 4, "int4": 38}}, ), ( OVModelForCausalLM, From 3ca93c8c78c5082ea49c52044a054edc82614de7 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 15:05:51 +0100 Subject: [PATCH 39/44] revert line --- tests/openvino/test_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index dbcd7a79b4..ff944472bb 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -235,7 +235,7 @@ def test_load_from_hub_and_save_visual_language_model(self): if is_transformers_version(">=", "4.51"): model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = AutoProcessor.from_pretrained(model_id) + processor = get_preprocessor(model_id) prompt = "What is shown in this image?" image = Image.open( requests.get( From 20250f68315c02bdf0d1de0c4f179b94590ad774 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 15:38:18 +0100 Subject: [PATCH 40/44] test offline on python 3.10 --- .github/workflows/test_offline.yaml | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test_offline.yaml b/.github/workflows/test_offline.yaml index 11dbc5fe31..13d3c2d3a6 100644 --- a/.github/workflows/test_offline.yaml +++ b/.github/workflows/test_offline.yaml @@ -14,6 +14,8 @@ concurrency: cancel-in-progress: true env: + UV_TORCH_BACKEND: cpu + UV_SYSTEM_PYTHON: true TRANSFORMERS_IS_CI: true jobs: @@ -27,11 +29,12 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: 3.9 + python-version: "3.10" - name: Install dependencies run: | - pip install .[tests,openvino] + pip install --upgrade pip uv + uv pip install .[openvino,diffusers,tests] - name: Test run: | From e5d2dc65b81504b5d0c512272c5c6990694bef81 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 15:39:18 +0100 Subject: [PATCH 41/44] ov 2025.4.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 10fc9c5c22..5427ca251d 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.18.0"], - "openvino": ["nncf>=2.18.0", "openvino>=2025.1.0", "openvino-tokenizers>=2025.1.0"], + "openvino": ["nncf>=2.18.0", "openvino>=2025.1.0,<2025.4.0", "openvino-tokenizers>=2025.1.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"], From ad94d8fa6da9c288a316e9228c52175482834896 Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 16:32:52 +0100 Subject: [PATCH 42/44] fix --- tests/openvino/test_modeling.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index ff944472bb..8300f72d83 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -235,7 +235,10 @@ def test_load_from_hub_and_save_visual_language_model(self): if is_transformers_version(">=", "4.51"): model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - processor = get_preprocessor(model_id) + if is_transformers_version("<", "4.57"): + processor = get_preprocessor(model_id) + else: + processor = AutoProcessor.from_pretrained(model_id) prompt = "What is shown in this image?" 
image = Image.open( requests.get( From 99372b87ba68f117885e1dd73e1a21f7bb05f5bf Mon Sep 17 00:00:00 2001 From: IlyasMoutawwakil Date: Mon, 1 Dec 2025 16:56:39 +0100 Subject: [PATCH 43/44] simply skip phi4 --- tests/openvino/test_modeling.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/openvino/test_modeling.py b/tests/openvino/test_modeling.py index 8300f72d83..9756e4b2b2 100644 --- a/tests/openvino/test_modeling.py +++ b/tests/openvino/test_modeling.py @@ -232,13 +232,12 @@ def test_load_from_hub_and_save_decoder_model(self, use_cache): def test_load_from_hub_and_save_visual_language_model(self): model_ids = [self.OV_VLM_MODEL_ID] - if is_transformers_version(">=", "4.51"): + if is_transformers_version(">=", "4.51") and is_transformers_version("<", "4.57"): + # the phi4 auto-processor can't be loaded in offline mode + # anymore due to an internal bug in transformers model_ids.append("katuni4ka/phi-4-multimodal-ov") for model_id in model_ids: - if is_transformers_version("<", "4.57"): - processor = get_preprocessor(model_id) - else: - processor = AutoProcessor.from_pretrained(model_id) + processor = get_preprocessor(model_id) prompt = "What is shown in this image?" image = Image.open( requests.get( From d14416b7eaf4dbd7647103456c9c26697b0b5695 Mon Sep 17 00:00:00 2001 From: Ilyas Moutawwakil <57442720+IlyasMoutawwakil@users.noreply.github.com> Date: Tue, 2 Dec 2025 08:27:32 +0100 Subject: [PATCH 44/44] Apply suggestion from @IlyasMoutawwakil --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 5427ca251d..10fc9c5c22 100644 --- a/setup.py +++ b/setup.py @@ -65,7 +65,7 @@ EXTRAS_REQUIRE = { "nncf": ["nncf>=2.18.0"], - "openvino": ["nncf>=2.18.0", "openvino>=2025.1.0,<2025.4.0", "openvino-tokenizers>=2025.1.0"], + "openvino": ["nncf>=2.18.0", "openvino>=2025.1.0", "openvino-tokenizers>=2025.1.0"], "neural-compressor": ["neural-compressor[pt]>=3.4.1", "accelerate", "transformers<4.46", "datasets"], "ipex": ["intel-extension-for-pytorch>=2.8", "transformers>4.54,<4.56", "accelerate"], "diffusers": ["diffusers"],