From 1cd87efb2e9cc76b46c8654a7125c97b32024a24 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 22 Oct 2025 05:56:19 +0000 Subject: [PATCH 1/7] Models test config in single Config file Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/constants.py | 4 +- tests/conftest.py | 35 -- .../models/custom_tiny_model_configs.json | 348 ------------ .../models/test_audio_embedding_models.py | 9 +- .../models/test_causal_lm_models.py | 126 ++--- .../models/test_embedding_models.py | 10 +- .../models/test_image_text_to_text_models.py | 350 ++++-------- .../models/test_model_configs.json | 532 ++++++++++++++++++ .../models/test_prefix_caching.py | 9 +- .../models/test_speech_seq2seq_models.py | 9 +- 10 files changed, 703 insertions(+), 729 deletions(-) delete mode 100644 tests/transformers/models/custom_tiny_model_configs.json create mode 100644 tests/transformers/models/test_model_configs.json diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 57fba282b..114a6fc11 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -17,7 +17,7 @@ ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 17 ONNX_EXPORT_MAX_NUM_IMAGES = 1 ONNX_EXPORT_MAX_IMAGE_TILES = 4 ONNX_EXPORT_IMAGE_WIDTH = 560 @@ -84,7 +84,7 @@ def get_models_dir(): ONNX_EXPORT_EXAMPLE_MAX_TOP_K_IDS = 512 ONNX_EXPORT_EXAMPLE_TOP_PS = 0.80 ONNX_EXPORT_EXAMPLE_MIN_PS = 0.99 -ONNX_EXPORT_OPSET = 13 +ONNX_EXPORT_OPSET = 17 COMPILER = ["/opt/qti-aic/exec/qaic-exec", "-aic-hw"] DEFAULT_AIC_HW_VERSION = "ai100" diff --git a/tests/conftest.py b/tests/conftest.py index ba0f341fe..051701036 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,46 +5,11 @@ # # ----------------------------------------------------------------------------- -import json import os import shutil -import pytest -from transformers import AutoConfig - from QEfficient.utils.constants import QEFF_MODELS_DIR from QEfficient.utils.logging_utils import logger -from QEfficient.utils.test_utils import ModelConfig - - -def get_custom_model_config_dict(configs): - """ - Converts a list of custom model configuration dictionaries into a dictionary - mapping model names to their corresponding AutoConfig objects. - - Args: - configs (List[Dict]): A list of dictionaries, each containing model configuration parameters. - - Returns: - Dict[str, AutoConfig]: A dictionary where keys are model names and values are AutoConfig objects. - """ - config_dict = {} - for config in configs: - model_name = config["model_name"] - config_dict[model_name] = AutoConfig.from_pretrained( - model_name, - trust_remote_code=config["model_name"] in ModelConfig.EXTERNAL_MODELS, - **config.get("additional_params", {}), - ) - return config_dict - - -# Pytest fixture to load custom model configs from a JSON file -@pytest.fixture(scope="session") -def custom_causal_model_config_dict(): - with open("tests/transformers/models/custom_tiny_model_configs.json", "r") as f: - custom_model_configs_data = json.load(f) - return get_custom_model_config_dict(custom_model_configs_data) def qeff_models_clean_up(): diff --git a/tests/transformers/models/custom_tiny_model_configs.json b/tests/transformers/models/custom_tiny_model_configs.json deleted file mode 100644 index 03a9541fd..000000000 --- a/tests/transformers/models/custom_tiny_model_configs.json +++ /dev/null @@ -1,348 +0,0 @@ -[ - { - "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "gpt2", - "model_type": "gpt2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50257, - "num_key_value_heads": 1 - } - }, - { - "model_name": "allenai/OLMo-2-0425-1B", - "model_type": "olmo2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 100352, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Salesforce/codegen-350M-mono", - "model_type": "codegen", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 4, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 51200, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - - { - "model_name": "microsoft/Phi-3-mini-4k-instruct", - "model_type": "phi3", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32064, - "num_key_value_heads": 1 - } - }, - { - "model_name": "tiiuae/falcon-7b", - "model_type": "falcon", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 65024, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", - "model_type": "qwen3_moe", - "additional_params": { - "hidden_size": 256, - "intermediate_size": 256, - "max_position_embeddings": 128, - "max_window_layers": 48, - "moe_intermediate_size": 768, - "num_attention_heads": 2, - "num_experts": 4, - "num_experts_per_tok": 2, - "num_hidden_layers": 1, - "num_key_value_heads": 1, - "vocab_size": 151936 - } - }, - { - "model_name": "Qwen/Qwen2-0.5B", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936, - "num_key_value_heads": 1 - } - }, - { - "model_name": "bigcode/starcoder2-3b", - "model_type": "starcoder2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Felladrin/Minueza-32M-Base", - "model_type": "mistral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32002, - "num_key_value_heads": 1 - } - }, - { - "model_name": "wtang06/mpt-125m-c4", - "model_type": "mpt", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50368 - } - }, - { - "model_name": "hakurei/gpt-j-random-tinier", - "model_type": "gptj", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 50400, - "num_key_value_heads": 1, - "rotary_dim": 16 - } - }, - { - "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "model_type": "mixtral", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "meta-llama/Llama-3.2-1B", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 32.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - }, - { - "model_name": "unsloth/gemma-2b", - "model_type": "gemma", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "unsloth/gemma-2-2b", - "model_type": "gemma2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 256000, - "num_key_value_heads": 1 - } - }, - { - "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32003 - } - }, - { - "model_name": "TheBloke/Llama-2-7B-GPTQ", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 32000 - } - }, - { - "model_name": "ibm-granite/granite-20b-code-base", - "model_type": "gpt_bigcode", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49152, - "num_key_value_heads": 1, - "activation_function": "gelu", - "architectures": [ - "GPTBigCodeForCausalLM" - ] - } - }, - { - "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", - "model_type": "llama", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 128256 - } - }, - { - "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", - "model_type": "qwen2", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 151936 - } - }, - { - "model_name": "ibm-granite/granite-3.1-2b-instruct", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "ibm-granite/granite-guardian-3.1-2b", - "model_type": "granite", - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 49155, - "num_key_value_heads": 1 - } - }, - { - "model_name": "hpcai-tech/grok-1", - "model_type": null, - "additional_params":{ - "max_position_embeddings": 128, - "num_hidden_layers": 1, - "num_attention_heads": 2, - "hidden_size": 64, - "intermediate_size": 256, - "vocab_size": 131072, - "num_key_value_heads": 1 - } - }, - { - "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "model_type": null, - "additional_params": { - "max_position_embeddings": 128, - "num_hidden_layers": 2, - "num_attention_heads": 2, - "hidden_size": 256, - "intermediate_size": 256, - "vocab_size": 128256, - "num_key_value_layers": 1, - "num_key_value_heads": 1, - "rope_scaling": { - "factor": 8.0, - "high_freq_factor": 4.0, - "low_freq_factor": 1.0, - "original_max_position_embeddings": 8192, - "rope_type": "llama3" - } - } - } -] diff --git a/tests/transformers/models/test_audio_embedding_models.py b/tests/transformers/models/test_audio_embedding_models.py index da30c76b0..75f9fac08 100644 --- a/tests/transformers/models/test_audio_embedding_models.py +++ b/tests/transformers/models/test_audio_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from typing import List, Optional @@ -23,9 +24,11 @@ from QEfficient.utils.constants import WAV2VEC2_MAX_SEQ_LEN, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "facebook/wav2vec2-base-960h", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["audio_embedding_models"] def load_ctc_model(model_config): diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 86bce4441..81f710f09 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -6,6 +6,7 @@ # ----------------------------------------------------------------------------- import copy +import json import os from typing import Optional @@ -24,47 +25,40 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig -test_models_causal = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "gpt2", - "Salesforce/codegen-350M-mono", - "microsoft/Phi-3-mini-4k-instruct", - "tiiuae/falcon-7b", - "Qwen/Qwen2-0.5B", - "Qwen/Qwen3-0.6B", - "bigcode/starcoder2-3b", - "Qwen/Qwen3-30B-A3B-Instruct-2507", - "Felladrin/Minueza-32M-Base", - "wtang06/mpt-125m-c4", - "hakurei/gpt-j-random-tinier", - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "unsloth/gemma-2-2b", - "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", # AWQ model - "TheBloke/Llama-2-7B-GPTQ", # GPTQ model - "ibm-granite/granite-20b-code-base", - # "neuralmagic/Meta-Llama-3.1-8B-Instruct-FP8-dynamic", # naive-quantized compressed-tensor FP8 model per-channel weight, per-token activations - "neuralmagic/Llama-3.2-3B-Instruct-FP8", # float quantized compressed-tensor per tensor both weight and activations - "neuralmagic/Qwen2-0.5B-Instruct-FP8", # fp8 quant method, static, with lm head ignored - "ibm-granite/granite-3.1-2b-instruct", - "ibm-granite/granite-guardian-3.1-2b", - "hpcai-tech/grok-1", - "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", - "allenai/OLMo-2-0425-1B", -] - -test_models_qnn = [ - "mistralai/Mixtral-8x7B-Instruct-v0.1", - "meta-llama/Llama-3.2-1B", - "unsloth/gemma-2b", - "ibm-granite/granite-guardian-3.1-2b", -] - -test_models_spd = [ - "TinyLlama/TinyLlama-1.1B-Chat-v1.0", - "Qwen/Qwen2-0.5B", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + causal_lm_models = config_data["causal_lm_models"] + spd_models = config_data["spd_causal_lm_models"] + qnn_models = config_data["qnn_causal_lm_models"] + + +# Create a list of model names for parameterization +test_models_causal = [model["model_name"] for model in causal_lm_models] +test_models_spd = [model["model_name"] for model in spd_models] +test_models_qnn = [model["model_name"] for model in qnn_models] + +# Create a dictionary mapping model names to their configs +model_config_dict = {model["model_name"]: model for model in causal_lm_models} + + +def get_hf_config_from_custom_config(model_name): + """ + Function to get HF config from custom config file + -------- + :model_name: str + + :return config + """ + custom_config = model_config_dict[model_name] + + hf_config = AutoConfig.from_pretrained( + model_name, + trust_remote_code=model_name in ModelConfig.EXTERNAL_MODELS, + **custom_config.get("additional_params", {}), + ) + return hf_config def get_custom_n_layers(model_name): @@ -101,7 +95,6 @@ def load_causal_lm_model(model_name, n_layer=1, config=None): ) if config is None: # If custom config is not provided, load the model config from Hugging Face if n_layer is not None: - # If n_layer is specified, load the model with that many layers model_hf = AutoModelForCausalLM.from_pretrained( model_path, use_cache=True, @@ -145,7 +138,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, config: Optional[AutoConfig] = None, - pytorch_hf_tokens: Optional[list] = None, ): """ Validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. @@ -172,7 +164,8 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.PROMPT_LEN, Constants.CTX_LEN, ) - if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + + if model_name not in ModelConfig.SWIFTKV_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) is_tlm = False if num_speculative_tokens is None else True @@ -191,8 +184,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( assert (pytorch_kv_tokens == ort_tokens).all(), "Tokens don't match for ONNXRT output and PyTorch output." - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") qpc_path = qeff_model.compile( prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -232,14 +223,10 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, full_batch_size, ) - - if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = [pytorch_hf_tokens for _ in range(full_batch_size)] - qeff_model = QEFFAutoModelForCausalLM( model_hf, continuous_batching=True, is_tlm=is_tlm, pretrained_model_name_or_path=model_name ) @@ -261,7 +248,6 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - if model_name in ModelConfig.SWIFTKV_MODELS: assert all( [ @@ -315,25 +301,19 @@ def test_causal_lm_export_with_deprecated_api(model_name): @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.parametrize("model_name", test_models_causal) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) - - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_custom_case"] + hf_config = get_hf_config_from_custom_config(model_name) if model_name in ModelConfig.QUANTIZED_MODELS: n_layer = get_custom_n_layers(model_name) - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, n_layer=n_layer) else: - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=config, pytorch_hf_tokens=pytorch_hf_tokens) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, config=hf_config) @pytest.mark.nightly @@ -347,34 +327,26 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ n_layer = get_custom_n_layers(model_name) - # Using fixed reference tokens for external models for specific test cases. - # These tokens are hardcoded, therefore will not match if the model config changes. - pytorch_hf_tokens = None - if model_name in ModelConfig.EXTERNAL_MODELS: - pytorch_hf_tokens = ModelConfig.EXTERNAL_MODELS[model_name]["pytorch_hf_tokens_normal_case"] - - check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, n_layer=n_layer, pytorch_hf_tokens=pytorch_hf_tokens - ) + check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100(model_name=model_name, n_layer=n_layer) @pytest.mark.on_qaic @pytest.mark.regular @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_qnn) -def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, custom_causal_model_config_dict): +def test_custom_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): """ QNN Setup Test function to validate the dummy PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=config + model_name, enable_qnn=True, qnn_config=qnn_config_json_path, config=hf_config ) @@ -402,18 +374,18 @@ def test_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name): @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.parametrize("model_name", test_models_spd) -def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name, custom_causal_model_config_dict): +def test_custom_causal_tlm_pytorch_vs_kv_vs_ort_vs_ai100(model_name): """ Test function to validate the dummy PyTorch model for speculative decoding, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, both with and without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - config = custom_causal_model_config_dict.get(model_name) + hf_config = get_hf_config_from_custom_config(model_name) check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, num_speculative_tokens=Constants.NUM_SPECULATIVE_TOKENS, - config=config, + config=hf_config, ) diff --git a/tests/transformers/models/test_embedding_models.py b/tests/transformers/models/test_embedding_models.py index 2d110faeb..e9a636d71 100644 --- a/tests/transformers/models/test_embedding_models.py +++ b/tests/transformers/models/test_embedding_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from typing import Optional @@ -19,10 +20,11 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import Constants, QnnConstants -embed_test_models = [ - {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, - {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"}, -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + embed_test_models = config_data["embedding_models"] def check_embed_pytorch_vs_ort_vs_ai100( diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index a7b4162aa..673d60f96 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -5,6 +5,7 @@ # # ---------------------------------------------------------------------------- +import json import os from io import BytesIO from typing import List, Optional @@ -32,167 +33,21 @@ from QEfficient.utils.test_utils import InternProcessor NEW_GENERATION_TOKENS = 10 -test_models_config = [ - # CONFIG PARAMS NEEDED FOR A MODEL TO BE TESTED - # ( - # model_name, - # kv_offload, - # batch_size, - # prompt_len, - # ctx_len, - # img_size, - # img_url", - # text_prompt, - # number of layers of the model, - # ), - ( - "llava-hf/llava-1.5-7b-hf", - True, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - ( - "llava-hf/llava-1.5-7b-hf", - False, - 1, - 784, - 1024, - 336, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - 1, - ), - # Disabled in CI due to performance issues - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # True, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - # ( - # "meta-llama/Llama-4-Scout-17B-16E-Instruct", - # False, - # 1, - # 128, - # 3072, - # 336, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", - # "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", - # 4, - # ), - ( - "google/gemma-3-4b-it", - True, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "google/gemma-3-4b-it", - False, - 1, - 128, - 3072, - 896, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - True, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - ( - "mistralai/Mistral-Small-3.1-24B-Instruct-2503", - False, - 1, - 128, - 4096, - 1540, - "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", - "Can you describe the image in detail.", - 1, - ), - # ( - # "meta-llama/Llama-3.2-11B-Vision-Instruct", - # True, - # 1, - # 32, - # 512, - # 560, - # "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", - # "Explain this image", - # 7, - # ), -] - -intern_model_config = [ - ( - "OpenGVLab/InternVL2_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - ( - "OpenGVLab/InternVL3_5-1B", - True, - 1, - 384, - 512, - "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - "Please describe the image in detail.", - 2, - ), - # ( - # "OpenGVLab/InternVL2_5-1B", - # False, - # 1, - # 384, - # 512, - # "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", - # "Please describe the image in detail.", - # 2, - # ), # commented becuase QNN Convertor is not supported for this model yet. -] - -molmo_model_config = [ - # Disabled in CI due to HF issues - # ( - # "allenai/Molmo-7B-D-0924", - # True, - # 1, - # 128, - # 4096, - # "https://picsum.photos/id/237/536/354", - # "Can you describe the image in detail.", - # 2, - # ), -] + +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + multimodal_models = config_data["multimodal_models"] + intern_models = config_data["intern_models"] + +test_mm_models = [model_config["model_name"] for model_config in multimodal_models] +test_intern_models = [model_config["model_name"] for model_config in intern_models] + +test_mm_models_config = {model["model_name"]: model for model in multimodal_models} +test_intern_config = {model["model_name"]: model for model in intern_models} + +model_config_dict = {**test_mm_models_config, **test_intern_config} def load_image_text_to_text_model(model_config): @@ -218,6 +73,28 @@ def load_image_text_to_text_model(model_config): return model_hf, params +def load_image_text_to_text_model_from_config(model_name, config): + torch.manual_seed(42) + model_path = hf_download( + repo_id=model_name, + ignore_patterns=["*.onnx", "*.ot", "*.md", "*.tflite", "*.pdf", "*.h5", "*.msgpack"], + ) + try: + model_hf = AutoModelForImageTextToText.from_config( + config, + ) + except ValueError: + model_hf = AutoModelForCausalLM.from_pretrained( + model_path, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + params = sum(p.numel() for p in model_hf.parameters()) + model_hf.eval() + return model_hf, params + + def set_num_layers(config, n_layer=1): ## -1 indicates use all the layers of the model. if n_layer == -1: @@ -252,14 +129,16 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( num_devices: int = 1, enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, + config: Optional[AutoConfig] = None, ): - model_config = {"model_name": model_name} - model_config["img_size"] = img_size - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True, padding=True) - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + if config is None: + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=True) + config = set_num_layers(config, n_layer=n_layer) + model_hf, _ = load_image_text_to_text_model(config) + else: + model_hf, _ = load_image_text_to_text_model_from_config(model_name, config) + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) n_layer = get_num_layers_vlm(config) image = Image.open(requests.get(img_url, stream=True).raw) if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": @@ -293,25 +172,12 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) streamer = TextStreamer(processor.tokenizer) pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - qeff_model = QEFFAutoModelForImageTextToText.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) - - # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( - # "Tokens don't match for pytorch HF output and pytorch KV output" - # ) + qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) qeff_model.export() - # onnx_model_path = qeff_model.export() - # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) - # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( - img_size=model_config["img_size"], + img_size=img_size, num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, @@ -491,8 +357,7 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( # onnx_model_path = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") + qeff_model.compile( num_patches=1, num_devices=num_devices, @@ -511,27 +376,26 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config -) -def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_mm_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ + if model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct": + pytest.skip("Performance issue: Skipping the test for Llama-4-Scout-17B-16E-Instruct model.") check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_size=img_size, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_size=model_config_dict[model_name]["img_size"], + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) @@ -539,12 +403,9 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer", test_models_config -) -def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_size, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_mm_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. ``Mandatory`` Args: @@ -558,14 +419,14 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_size=img_size, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_size=model_config_dict[model_name]["img_size"], + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, enable_qnn=True, qnn_config=qnn_config_json_path, @@ -574,42 +435,20 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn( @pytest.mark.on_qaic @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", molmo_model_config -) -def test_image_text_to_text_molmo_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): - check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, - max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_intern_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): + if not kv_offload: + pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, ) @@ -617,24 +456,23 @@ def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal -@pytest.mark.parametrize( - "model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer", intern_model_config -) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn( - model_name, kv_offload, batch_size, prompt_len, ctx_len, img_url, query, n_layer -): +@pytest.mark.parametrize("model_name", test_intern_models) +@pytest.mark.parametrize("kv_offload", [True, False]) +def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload): + if not kv_offload: + pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, - prompt_len=prompt_len, - ctx_len=ctx_len, + prompt_len=model_config_dict[model_name]["prompt_len"], + ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_url=img_url, - query=query, - n_layer=n_layer, - batch_size=batch_size, + img_url=model_config_dict[model_name]["img_url"], + query=model_config_dict[model_name]["text_prompt"], + n_layer=model_config_dict[model_name]["num_layers"], + batch_size=model_config_dict[model_name]["batch_size"], kv_offload=kv_offload, enable_qnn=True, qnn_config=qnn_config_json_path, diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json new file mode 100644 index 000000000..63cb429d5 --- /dev/null +++ b/tests/transformers/models/test_model_configs.json @@ -0,0 +1,532 @@ +{ + "causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + }, + { + "model_name": "allenai/OLMo-2-0425-1B", + "model_type": "olmo2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 100352, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Salesforce/codegen-350M-mono", + "model_type": "codegen", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 4, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 51200, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + + { + "model_name": "microsoft/Phi-3-mini-4k-instruct", + "model_type": "phi3", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32064, + "num_key_value_heads": 1 + } + }, + { + "model_name": "tiiuae/falcon-7b", + "model_type": "falcon", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 65024, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen3-30B-A3B-Instruct-2507", + "model_type": "qwen3_moe", + "additional_params": { + "hidden_size": 256, + "intermediate_size": 256, + "max_position_embeddings": 128, + "max_window_layers": 48, + "moe_intermediate_size": 768, + "num_attention_heads": 2, + "num_experts": 4, + "num_experts_per_tok": 2, + "num_hidden_layers": 1, + "num_key_value_heads": 1, + "vocab_size": 151936 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + }, + { + "model_name": "bigcode/starcoder2-3b", + "model_type": "starcoder2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Felladrin/Minueza-32M-Base", + "model_type": "mistral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32002, + "num_key_value_heads": 1 + } + }, + { + "model_name": "wtang06/mpt-125m-c4", + "model_type": "mpt", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50368 + } + }, + { + "model_name": "hakurei/gpt-j-random-tinier", + "model_type": "gptj", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50400, + "num_key_value_heads": 1, + "rotary_dim": 16 + } + }, + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "unsloth/gemma-2-2b", + "model_type": "gemma2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32003 + } + }, + { + "model_name": "TheBloke/Llama-2-7B-GPTQ", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000 + } + }, + { + "model_name": "ibm-granite/granite-20b-code-base", + "model_type": "gpt_bigcode", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49152, + "num_key_value_heads": 1, + "activation_function": "gelu", + "architectures": [ + "GPTBigCodeForCausalLM" + ] + } + }, + { + "model_name": "neuralmagic/Llama-3.2-3B-Instruct-FP8", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256 + } + }, + { + "model_name": "neuralmagic/Qwen2-0.5B-Instruct-FP8", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936 + } + }, + { + "model_name": "ibm-granite/granite-3.1-2b-instruct", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + }, + { + "model_name": "hpcai-tech/grok-1", + "model_type": null, + "additional_params":{ + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 131072, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Snowflake/Llama-3.1-SwiftKV-8B-Instruct", + "model_type": null, + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 2, + "num_attention_heads": 2, + "hidden_size": 256, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_layers": 1, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + } + ], + "multimodal_models": [ + { + "model_name": "llava-hf/llava-1.5-7b-hf", + "model_type": "llava", + "batch_size": 1, + "prompt_len": 784, + "ctx_len": 1024, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 1, + "additional_params": { + } + }, + { + "model_name": "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "model_type": "llama4", + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 3072, + "img_size": 336, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", + "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", + "num_layers": 4, + "additional_params": { + } + }, + { + "model_name": "google/gemma-3-4b-it", + "model_type": "gemma3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 3072, + "img_size": 896, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": { + } + }, + { + "model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", + "model_type": "mistral3", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": { + } + } + + ], + "intern_models": [ + { + "model_name": "OpenGVLab/InternVL2_5-1B", + "batch_size": 1, + "prompt_len": 384, + "ctx_len": 512, + "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "text_prompt": "Please describe the image in detail.", + "num_layers": 2, + "additional_params": { + } + } + ], + "speech_seq2seq_models": [ + "openai/whisper-tiny" + ], + "embedding_models": [ + {"model_name": "jinaai/jina-embeddings-v2-base-code", "pooling": "mean"}, + {"model_name": "sentence-transformers/nli-bert-base-cls-pooling", "pooling": "cls"} + ], + "spd_causal_lm_models": [ + { + "model_name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "Qwen/Qwen2-0.5B", + "model_type": "qwen2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 151936, + "num_key_value_heads": 1 + } + } + ], + "qnn_causal_lm_models": [ + { + "model_name": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_type": "mixtral", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 32000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "meta-llama/Llama-3.2-1B", + "model_type": "llama", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 128256, + "num_key_value_heads": 1, + "rope_scaling": { + "factor": 32.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + } + } + }, + { + "model_name": "unsloth/gemma-2b", + "model_type": "gemma", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 256000, + "num_key_value_heads": 1 + } + }, + { + "model_name": "ibm-granite/granite-guardian-3.1-2b", + "model_type": "granite", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 49155, + "num_key_value_heads": 1 + } + } + ], + "prefix_caching_models": [ + { + "model_name": "gpt2", + "model_type": "gpt2", + "additional_params": { + "max_position_embeddings": 128, + "num_hidden_layers": 1, + "num_attention_heads": 2, + "hidden_size": 64, + "intermediate_size": 256, + "vocab_size": 50257, + "num_key_value_heads": 1 + } + } + ], + "audio_embedding_models": [ + "facebook/wav2vec2-base-960h" + ] +} diff --git a/tests/transformers/models/test_prefix_caching.py b/tests/transformers/models/test_prefix_caching.py index 88862fce7..a9662cc73 100644 --- a/tests/transformers/models/test_prefix_caching.py +++ b/tests/transformers/models/test_prefix_caching.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os import numpy as np @@ -16,7 +17,13 @@ from QEfficient.utils._utils import create_json from QEfficient.utils.constants import QnnConstants -test_models = ["gpt2"] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + prefix_caching_models = config_data["prefix_caching_models"] + +test_models = [model["model_name"] for model in prefix_caching_models] # The test should first generate output with some prefix+suffix1 or batch_id and then confirm that we are still able to execute of prefix+suffix2 on same batch id and getting correct output. diff --git a/tests/transformers/models/test_speech_seq2seq_models.py b/tests/transformers/models/test_speech_seq2seq_models.py index 4ae8928b7..52a96d7fe 100644 --- a/tests/transformers/models/test_speech_seq2seq_models.py +++ b/tests/transformers/models/test_speech_seq2seq_models.py @@ -5,6 +5,7 @@ # # ----------------------------------------------------------------------------- +import json import os from importlib import reload from typing import List, Optional @@ -25,9 +26,11 @@ from QEfficient.utils.constants import Constants, QnnConstants from QEfficient.utils.device_utils import get_available_device_id -test_models = [ - "openai/whisper-tiny", -] +CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") + +with open(CONFIG_PATH, "r") as f: + config_data = json.load(f) + test_models = config_data["speech_seq2seq_models"] def load_seq2seq_model(model_config): From d7e045e5c7f8c74a2452ec31f2fa86d0c71aab70 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Thu, 23 Oct 2025 08:11:36 +0000 Subject: [PATCH 2/7] Conflict Resolved Signed-off-by: Abukhoyer Shaik --- tests/transformers/models/test_causal_lm_models.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index 81f710f09..369c9ac8d 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -165,7 +165,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, ) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch(model_hf) is_tlm = False if num_speculative_tokens is None else True @@ -174,7 +174,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( ) pytorch_kv_tokens = api_runner.run_kv_model_on_pytorch(qeff_model.model) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( "Tokens don't match for HF PyTorch model output and KV PyTorch model output" ) @@ -223,7 +223,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( Constants.CTX_LEN, full_batch_size, ) - if model_name not in ModelConfig.SWIFTKV_MODELS: + if model_name not in ModelConfig.SWIFTKV_MODELS and model_name not in ModelConfig.EXTERNAL_MODELS: pytorch_hf_tokens = api_runner.run_hf_model_on_pytorch_CB(model_hf) pytorch_hf_tokens = np.vstack(pytorch_hf_tokens) @@ -248,7 +248,7 @@ def check_causal_lm_pytorch_vs_kv_vs_ort_vs_ai100( qnn_config=qnn_config, ) exec_info_fbs = qeff_model.generate(tokenizer, prompts=fbs_prompts) - if model_name in ModelConfig.SWIFTKV_MODELS: + if model_name in ModelConfig.SWIFTKV_MODELS or model_name in ModelConfig.EXTERNAL_MODELS: assert all( [ all(ort_token[:24] == cloud_token[:24]) From cbf7325b4bcced13594224ba11390a600983b559 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Tue, 2 Dec 2025 09:47:41 +0000 Subject: [PATCH 3/7] combined the image models checking functions Signed-off-by: Abukhoyer Shaik --- .../models/test_image_text_to_text_models.py | 475 ++++++++---------- .../models/test_model_configs.json | 34 +- 2 files changed, 218 insertions(+), 291 deletions(-) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index 95b34a76d..e4150a930 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -39,15 +39,12 @@ with open(CONFIG_PATH, "r") as f: config_data = json.load(f) multimodal_models = config_data["multimodal_models"] - intern_models = config_data["intern_models"] test_mm_models = [model_config["model_name"] for model_config in multimodal_models] -test_intern_models = [model_config["model_name"] for model_config in intern_models] test_mm_models_config = {model["model_name"]: model for model in multimodal_models} -test_intern_config = {model["model_name"]: model for model in intern_models} -model_config_dict = {**test_mm_models_config, **test_intern_config} +model_config_dict = {**test_mm_models_config} @@ -118,7 +115,6 @@ def set_num_layers(config, n_layer=1): def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name: str, - img_size: int, img_url: str, query: str, prompt_len: int, @@ -131,247 +127,211 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( enable_qnn: Optional[bool] = False, qnn_config: Optional[str] = None, config: Optional[AutoConfig] = None, + img_size: Optional[int] = None, ): + """ + Unified function to test PyTorch model, PyTorch KV model, ONNX model, and Cloud AI 100 model. + Handles standard VLM models, InternVL models, and Molmo models. + + Args: + model_name: Hugging Face model identifier + img_url: URL to image for testing + query: Text query for the model + prompt_len: Prompt sequence length + ctx_len: Context length + max_gen_len: Maximum generation length + batch_size: Batch size for processing + n_layer: Number of layers to use + kv_offload: Whether to use KV offloading + num_devices: Number of devices to use + enable_qnn: Enable QNN compilation + qnn_config: Path to QNN config file + config: Pre-configured model config (optional) + img_size: Image size for standard models (optional) + """ + + is_intern_model = model_name == "OpenGVLab/InternVL2_5-1B" + is_molmo_model = model_name == "allenai/Molmo-7B-D-0924" + + # ========== Config and Model Loading ========== if config is None: - config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=True) + config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=not is_molmo_model) + config._attn_implementation = "eager" if (is_intern_model or is_molmo_model) else None config = set_num_layers(config, n_layer=n_layer) + + if is_intern_model: + model_hf = AutoModelForCausalLM.from_pretrained( + model_name, + low_cpu_mem_usage=False, + trust_remote_code=True, + config=config, + ) + elif is_molmo_model: model_hf, _ = load_image_text_to_text_model(config) + n_layer = (n_layer, n_layer) else: - model_hf, _ = load_image_text_to_text_model_from_config(model_name, config) + model_hf, _ = load_image_text_to_text_model(config) - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + n_layer = get_num_layers_vlm(config) - image = Image.open(requests.get(img_url, stream=True).raw) - if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": - image = image.resize((1540, 1540)) - - conversation = [ - { - "role": "user", - "content": [ - {"type": "text", "text": query}, - {"type": "image"}, - ], - }, - ] - prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) - api_runner = ApiRunnerVlm( - batch_size, - processor, - config, - image, - conversation, - prompt, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) - - inputs = processor(images=image, text=prompt, return_tensors="pt") - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - streamer = TextStreamer(processor.tokenizer) - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) - - qeff_model.export() - - qeff_model.compile( - img_size=img_size, - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) - inputs = processor(images=image, text=prompt, return_tensors="pt") - if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": - inputs = qeff_model.model.prepare_inputs_for_generation( - inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + + # ========== Processor and Image Loading ========== + if is_intern_model: + tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) + processor = InternProcessor(model_hf, tokenizer) + else: + processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) + + if is_intern_model: + prompt = [query] + img_url_list = [img_url] + pixel_values = [] + num_patches_list = [] + questions = [] + for i in range(len(prompt)): + img = requests.get(img_url_list[i], stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((448, 448)) + pixel_value = processor.load_image(image, max_num=12) + num_patches_list.append(pixel_value.shape[0]) + pixel_values.append(pixel_value) + question = "\n" + prompt[i] + questions.append(question) + pixel_values = torch.cat(pixel_values, dim=0) + else: + if is_molmo_model: + img = requests.get(img_url, stream=True) + image = Image.open(BytesIO(img.content)).convert("RGB") + image = image.resize((536, 354)) + else: + image = Image.open(requests.get(img_url, stream=True).raw) + if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": + image = image.resize((1540, 1540)) + + # ========== Prepare Inputs and Get PyTorch HF Tokens ========== + if is_intern_model: + messages: List[List[str]] = [] + roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") + prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) + inputs = tokenizer(prompt, return_tensors="pt") + batch_size, prompt_len = inputs["input_ids"].shape + inputs["pixel_values"] = pixel_values.clone() + generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) + generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) + api_runner = ApiRunnerInternVL( + batch_size, + processor, + config, + image, + query, + prompt_len, + ctx_len, + max_gen_len, + n_layer, ) - if "pixel_values" in inputs: - inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - print("QPC Outputs (QAIC):") - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - qpc_tokens = output.generated_ids[:, :-1] - assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return - - -def check_molmo_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf, _ = load_image_text_to_text_model(config) - n_layer = (n_layer, n_layer) - - processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - img = requests.get(img_url, stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - image = image.resize((536, 354)) - - api_runner = ApiRunnerMolmo( - batch_size, - processor, - config, - image, - query, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) - - inputs = processor.process(images=[image], text=query) - inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} - - generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>") - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) - - batch_size, prompt_len = inputs["input_ids"].shape - inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) - valid = inputs["image_input_idx"] > 0 - valid = valid.reshape(1, -1) - inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) - inputs["pixel_values"] = inputs.pop("images") - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) - - streamer = TextStreamer(processor.tokenizer) - qeff_model.export() - - if not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") - qeff_model.compile(num_devices=num_devices, prefill_seq_len=prompt_len, ctx_len=ctx_len, mxfp6=False) - print("QPC Outputs (QAIC):") - output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) - qpc_tokens = output.generated_ids[:, :-1] - assert (pytorch_hf_tokens == qpc_tokens).all(), "Tokens don't match for pytorch HF output and QPC output" - return - - -def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name: str, - img_url: str, - query: str, - prompt_len: int, - ctx_len: int, - max_gen_len: int = 20, - batch_size: int = 1, - n_layer: int = 1, - kv_offload: bool = False, - num_devices: int = 1, - enable_qnn: Optional[bool] = False, - qnn_config: Optional[str] = None, -): - model_config = {"model_name": model_name} - - config = AutoConfig.from_pretrained(model_config["model_name"], trust_remote_code=True) - config._attn_implementation = "eager" - config = set_num_layers(config, n_layer=n_layer) - model_hf = AutoModelForCausalLM.from_pretrained( - model_name, - low_cpu_mem_usage=False, - trust_remote_code=True, - config=config, - ) - n_layer = get_num_layers_vlm(config) - - tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) - processor = InternProcessor(model_hf, tokenizer) - - prompt = [query] - img_url = [img_url] - pixel_values = [] - num_patches_list = [] - questions = [] - for i in range(len(prompt)): - img = requests.get(img_url[i], stream=True) - image = Image.open(BytesIO(img.content)).convert("RGB") - - image = image.resize((448, 448)) - - # preprocess the resized image - pixel_value = processor.load_image(image, max_num=12) - num_patches_list.append(pixel_value.shape[0]) - pixel_values.append(pixel_value) - - question = "\n" + prompt[i] - questions.append(question) - - pixel_values = torch.cat(pixel_values, dim=0) - - # Chat Template information for prompt preprocessing - messages: List[List[str]] = [] - roles = ("<|im_start|>user\n", "<|im_start|>assistant\n") - prompt = processor(pixel_values, questions, messages, roles, num_patches_list=num_patches_list) - - inputs = tokenizer(prompt, return_tensors="pt") - batch_size, prompt_len = inputs["input_ids"].shape - inputs["pixel_values"] = pixel_values.clone() - - generation_config = dict(max_new_tokens=max_gen_len, do_sample=False) - generation_config["eos_token_id"] = tokenizer.convert_tokens_to_ids("<|im_end|>\n".strip()) - api_runner = ApiRunnerInternVL( - batch_size, - processor, - config, - image, - query, - prompt_len, - ctx_len, - max_gen_len, - n_layer, - ) - pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) - - qeff_model = QEFFAutoModelForCausalLM.from_pretrained( - model_config["model_name"], - kv_offload=kv_offload, - config=config, - ) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) + elif is_molmo_model: + inputs = processor.process(images=[image], text=query) + inputs = {k: v.unsqueeze(0) for k, v in inputs.items()} + generation_config = GenerationConfig(max_new_tokens=NEW_GENERATION_TOKENS, stop_strings="<|endoftext|>") + api_runner = ApiRunnerMolmo( + batch_size, + processor, + config, + image, + query, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs, generation_config) + batch_size, prompt_len = inputs["input_ids"].shape + inputs["attention_mask"] = torch.ones((inputs["input_ids"].shape), dtype=torch.int64) + valid = inputs["image_input_idx"] > 0 + valid = valid.reshape(1, -1) + inputs["valid_idx"] = torch.nonzero(valid)[:, 1].unsqueeze(0) + inputs["pixel_values"] = inputs.pop("images") + else: + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": query}, + {"type": "image"}, + ], + }, + ] + prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) + api_runner = ApiRunnerVlm( + batch_size, + processor, + config, + image, + conversation, + prompt, + prompt_len, + ctx_len, + max_gen_len, + n_layer, + ) + inputs = processor(images=image, text=prompt, return_tensors="pt") + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) + # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) - # assert (pytorch_hf_tokens == pytorch_kv_tokens).all(), ( - # "Tokens don't match for pytorch HF output and QEFF KV Model output" + # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( + # "Tokens don't match for pytorch HF output and pytorch KV output" # ) streamer = TextStreamer(processor.tokenizer) + + # ========== Export and Compile Model ========== + if is_intern_model or is_molmo_model: + qeff_model = QEFFAutoModelForCausalLM.from_pretrained( + model_name, + kv_offload=kv_offload, + config=config, + ) + else: + qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) + qeff_model.export() # onnx_model_path = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - - qeff_model.compile( - num_patches=1, - num_devices=num_devices, - prefill_seq_len=prompt_len, - ctx_len=ctx_len, - mxfp6=False, - enable_qnn=enable_qnn, - qnn_config=qnn_config, - ) + + compile_kwargs = { + "num_devices": num_devices, + "prefill_seq_len": prompt_len, + "ctx_len": ctx_len, + "mxfp6": False, + "enable_qnn": enable_qnn, + "qnn_config": qnn_config, + } + + if is_intern_model: + compile_kwargs["num_patches"] = 1 + elif not is_molmo_model and img_size is not None: + compile_kwargs["img_size"] = img_size + + qeff_model.compile(**compile_kwargs) + + # ========== Generate and Verify Output ========== + if is_molmo_model and not get_available_device_id(): + pytest.skip("No available devices to run model on Cloud AI 100") + + if not is_intern_model and not is_molmo_model: + inputs = processor(images=image, text=prompt, return_tensors="pt") + if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": + inputs = qeff_model.model.prepare_inputs_for_generation( + inputs=inputs, prefill_seq_len=prompt_len, batch_size=batch_size + ) + if "pixel_values" in inputs: + inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) + print("QPC Outputs (QAIC):") output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] @@ -382,7 +342,7 @@ def check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize("model_name", test_mm_models) -@pytest.mark.parametrize("kv_offload", [True, False]) +@pytest.mark.parametrize("kv_offload", [True]) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. @@ -391,12 +351,16 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload """ if model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct": pytest.skip("Performance issue: Skipping the test for Llama-4-Scout-17B-16E-Instruct model.") + + # Get img_size for standard models, None for InternVL and Molmo + img_size = model_config_dict[model_name].get("img_size") + check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=model_config_dict[model_name]["prompt_len"], ctx_len=model_config_dict[model_name]["ctx_len"], max_gen_len=NEW_GENERATION_TOKENS, - img_size=model_config_dict[model_name]["img_size"], + img_size=img_size, img_url=model_config_dict[model_name]["img_url"], query=model_config_dict[model_name]["text_prompt"], n_layer=model_config_dict[model_name]["num_layers"], @@ -405,6 +369,9 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload ) +### QNN Tests ### + + @pytest.mark.on_qaic @pytest.mark.qnn @pytest.mark.multimodal @@ -436,49 +403,3 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_off enable_qnn=True, qnn_config=qnn_config_json_path, ) - - -@pytest.mark.on_qaic -@pytest.mark.multimodal -@pytest.mark.parametrize("model_name", test_intern_models) -@pytest.mark.parametrize("kv_offload", [True, False]) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): - if not kv_offload: - pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") - check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=model_config_dict[model_name]["prompt_len"], - ctx_len=model_config_dict[model_name]["ctx_len"], - max_gen_len=NEW_GENERATION_TOKENS, - img_url=model_config_dict[model_name]["img_url"], - query=model_config_dict[model_name]["text_prompt"], - n_layer=model_config_dict[model_name]["num_layers"], - batch_size=model_config_dict[model_name]["batch_size"], - kv_offload=kv_offload, - ) - - -@pytest.mark.on_qaic -@pytest.mark.qnn -@pytest.mark.multimodal -@pytest.mark.parametrize("model_name", test_intern_models) -@pytest.mark.parametrize("kv_offload", [True, False]) -def test_image_text_to_text_intern_pytorch_vs_kv_vs_ort_vs_ai100_qnn(model_name, kv_offload): - if not kv_offload: - pytest.skip("Single Qpc is not supported for InternVL without kv_offload.") - qnn_config_json_path = os.path.join(os.getcwd(), "qnn_config.json") - create_json(qnn_config_json_path, QnnConstants.QNN_SAMPLE_CONFIG) - - check_intern_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( - model_name=model_name, - prompt_len=model_config_dict[model_name]["prompt_len"], - ctx_len=model_config_dict[model_name]["ctx_len"], - max_gen_len=NEW_GENERATION_TOKENS, - img_url=model_config_dict[model_name]["img_url"], - query=model_config_dict[model_name]["text_prompt"], - n_layer=model_config_dict[model_name]["num_layers"], - batch_size=model_config_dict[model_name]["batch_size"], - kv_offload=kv_offload, - enable_qnn=True, - qnn_config=qnn_config_json_path, - ) diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json index 63cb429d5..1183a05da 100644 --- a/tests/transformers/models/test_model_configs.json +++ b/tests/transformers/models/test_model_configs.json @@ -358,8 +358,7 @@ "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", "num_layers": 1, - "additional_params": { - } + "additional_params": {} }, { "model_name": "meta-llama/Llama-4-Scout-17B-16E-Instruct", @@ -371,8 +370,7 @@ "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/ai2d-demo.jpg", "text_prompt": "What does the label 15 represent? (1) lava (2) core (3) tunnel (4) ash cloud", "num_layers": 4, - "additional_params": { - } + "additional_params": {} }, { "model_name": "google/gemma-3-4b-it", @@ -384,8 +382,7 @@ "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", "text_prompt": "Can you describe the image in detail.", "num_layers": 1, - "additional_params": { - } + "additional_params": {} }, { "model_name": "mistralai/Mistral-Small-3.1-24B-Instruct-2503", @@ -397,22 +394,31 @@ "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/datasets/cat_style_layout.png", "text_prompt": "Can you describe the image in detail.", "num_layers": 1, - "additional_params": { - } - } - - ], - "intern_models": [ + "additional_params": {} + }, + { + "model_name": "allenai/Molmo-7B-D-0924", + "model_type": "molmo", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": null, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 2, + "additional_params": {} + }, { "model_name": "OpenGVLab/InternVL2_5-1B", + "model_type": null, "batch_size": 1, "prompt_len": 384, "ctx_len": 512, + "img_size": null, "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", "text_prompt": "Please describe the image in detail.", "num_layers": 2, - "additional_params": { - } + "additional_params": {} } ], "speech_seq2seq_models": [ From 29681abf7bc060a8f599378d3e1b48d39040603f Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 3 Dec 2025 05:08:34 +0000 Subject: [PATCH 4/7] comments incorporating Signed-off-by: Abukhoyer Shaik --- QEfficient/utils/constants.py | 2 -- .../models/qnn_config.json => qnn_config.json | 0 .../models/test_image_text_to_text_models.py | 21 +++++---------- .../models/test_model_configs.json | 27 ++++++++++++++++++- 4 files changed, 33 insertions(+), 17 deletions(-) rename tests/transformers/models/qnn_config.json => qnn_config.json (100%) diff --git a/QEfficient/utils/constants.py b/QEfficient/utils/constants.py index 463e13ebb..1504bdae5 100644 --- a/QEfficient/utils/constants.py +++ b/QEfficient/utils/constants.py @@ -17,8 +17,6 @@ ONNX_EXPORT_EXAMPLE_SEQ_LEN = 32 ONNX_EXPORT_EXAMPLE_FBS = 4 ONNX_EXPORT_EXAMPLE_NLK = 2 # Number of Logits to Keep -ONNX_EXPORT_OPSET = 17 - ONNX_EXPORT_MAX_NUM_IMAGES = 1 ONNX_EXPORT_MAX_IMAGE_TILES = 4 ONNX_EXPORT_IMAGE_WIDTH = 560 diff --git a/tests/transformers/models/qnn_config.json b/qnn_config.json similarity index 100% rename from tests/transformers/models/qnn_config.json rename to qnn_config.json diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index e4150a930..d7e6e0b99 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -28,7 +28,6 @@ from QEfficient.utils import hf_download from QEfficient.utils._utils import create_json, get_num_layers_vlm from QEfficient.utils.constants import QnnConstants -from QEfficient.utils.device_utils import get_available_device_id from QEfficient.utils.run_utils import ApiRunnerInternVL, ApiRunnerMolmo, ApiRunnerVlm from QEfficient.utils.test_utils import InternProcessor @@ -41,11 +40,7 @@ multimodal_models = config_data["multimodal_models"] test_mm_models = [model_config["model_name"] for model_config in multimodal_models] - -test_mm_models_config = {model["model_name"]: model for model in multimodal_models} - -model_config_dict = {**test_mm_models_config} - +model_config_dict = {model["model_name"]: model for model in multimodal_models} def load_image_text_to_text_model(model_config): @@ -166,15 +161,15 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( trust_remote_code=True, config=config, ) + n_layer = get_num_layers_vlm(config) + elif is_molmo_model: model_hf, _ = load_image_text_to_text_model(config) n_layer = (n_layer, n_layer) else: model_hf, _ = load_image_text_to_text_model(config) + n_layer = get_num_layers_vlm(config) - - n_layer = get_num_layers_vlm(config) - # ========== Processor and Image Loading ========== if is_intern_model: tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False) @@ -320,8 +315,6 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( qeff_model.compile(**compile_kwargs) # ========== Generate and Verify Output ========== - if is_molmo_model and not get_available_device_id(): - pytest.skip("No available devices to run model on Cloud AI 100") if not is_intern_model and not is_molmo_model: inputs = processor(images=image, text=prompt, return_tensors="pt") @@ -342,15 +335,15 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( @pytest.mark.on_qaic @pytest.mark.multimodal @pytest.mark.parametrize("model_name", test_mm_models) -@pytest.mark.parametrize("kv_offload", [True]) +@pytest.mark.parametrize("kv_offload", [True, False]) def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload): """ Test function to validate the PyTorch model, the PyTorch model after KV changes, the ONNX model, and the Cloud AI 100 model, without continuous batching. ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - if model_name == "meta-llama/Llama-4-Scout-17B-16E-Instruct": - pytest.skip("Performance issue: Skipping the test for Llama-4-Scout-17B-16E-Instruct model.") + if model_name in ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "allenai/Molmo-7B-D-0924", "meta-llama/Llama-3.2-11B-Vision-Instruct"]: + pytest.skip("Test skipped for this model due to some issues.") # Get img_size for standard models, None for InternVL and Molmo img_size = model_config_dict[model_name].get("img_size") diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json index 1183a05da..df7e90e15 100644 --- a/tests/transformers/models/test_model_configs.json +++ b/tests/transformers/models/test_model_configs.json @@ -396,7 +396,19 @@ "num_layers": 1, "additional_params": {} }, - { + { + "model_name": "Qwen/Qwen2.5-VL-3B-Instruct", + "model_type": "qwen2_5_vl", + "batch_size": 1, + "prompt_len": 128, + "ctx_len": 4096, + "img_size": 1540, + "img_url": "https://picsum.photos/id/237/536/354", + "text_prompt": "Can you describe the image in detail.", + "num_layers": 1, + "additional_params": {} + }, + { "model_name": "allenai/Molmo-7B-D-0924", "model_type": "molmo", "batch_size": 1, @@ -419,7 +431,20 @@ "text_prompt": "Please describe the image in detail.", "num_layers": 2, "additional_params": {} + }, + { + "model_name": "meta-llama/Llama-3.2-11B-Vision-Instruct", + "model_type": "mllama", + "batch_size": 1, + "prompt_len": 32, + "ctx_len": 512, + "img_size": 560, + "img_url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg", + "text_prompt": "Explain this image", + "num_layers": 7, + "additional_params": {} } + ], "speech_seq2seq_models": [ "openai/whisper-tiny" From 508f6a88afbc5da89b636249181b2744bc0a1cfb Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 3 Dec 2025 07:17:52 +0000 Subject: [PATCH 5/7] comments Signed-off-by: Abukhoyer Shaik --- qnn_config.json | 10 ---------- .../models/test_image_text_to_text_models.py | 3 ++- tests/transformers/models/test_model_configs.json | 14 +++++++++++++- 3 files changed, 15 insertions(+), 12 deletions(-) delete mode 100644 qnn_config.json diff --git a/qnn_config.json b/qnn_config.json deleted file mode 100644 index b1f249e2b..000000000 --- a/qnn_config.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "SKIP_QNN_CONVERTER_STEP":false, - "context_binary_generator_args_extension":"--log_level debug", - "converter_args_extension":"--onnx_defer_loading", - "qnn_compilation_backend":{ - "compiler_enable_depth_first":true, - "compiler_printDDRStats":false, - "compiler_printPerfMetrics":false - } -} \ No newline at end of file diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index d7e6e0b99..f55fa95f1 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -344,7 +344,8 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload """ if model_name in ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "allenai/Molmo-7B-D-0924", "meta-llama/Llama-3.2-11B-Vision-Instruct"]: pytest.skip("Test skipped for this model due to some issues.") - + if model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B","Qwen/Qwen2.5-VL-3B-Instruct"] and not kv_offload: + pytest.skip("These models require kv_offload=True for testing.") # Get img_size for standard models, None for InternVL and Molmo img_size = model_config_dict[model_name].get("img_size") diff --git a/tests/transformers/models/test_model_configs.json b/tests/transformers/models/test_model_configs.json index df7e90e15..d75eee0c1 100644 --- a/tests/transformers/models/test_model_configs.json +++ b/tests/transformers/models/test_model_configs.json @@ -422,7 +422,19 @@ }, { "model_name": "OpenGVLab/InternVL2_5-1B", - "model_type": null, + "model_type": "internvl_chat", + "batch_size": 1, + "prompt_len": 384, + "ctx_len": 512, + "img_size": null, + "img_url": "https://image.slidesharecdn.com/azureintroduction-191206101932/75/Introduction-to-Microsoft-Azure-Cloud-1-2048.jpg", + "text_prompt": "Please describe the image in detail.", + "num_layers": 2, + "additional_params": {} + }, + { + "model_name": "OpenGVLab/InternVL3_5-1B", + "model_type": "internvl_chat", "batch_size": 1, "prompt_len": 384, "ctx_len": 512, From 4d102ee04a5c42826561904fe73ec3efeb12234b Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 3 Dec 2025 07:43:14 +0000 Subject: [PATCH 6/7] comments Signed-off-by: Abukhoyer Shaik --- tests/transformers/models/test_image_text_to_text_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index f55fa95f1..aa9fe2e3a 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -145,7 +145,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( img_size: Image size for standard models (optional) """ - is_intern_model = model_name == "OpenGVLab/InternVL2_5-1B" + is_intern_model = model_name == "OpenGVLab/InternVL2_5-1B" or model_name == "OpenGVLab/InternVL3_5-1B" is_molmo_model = model_name == "allenai/Molmo-7B-D-0924" # ========== Config and Model Loading ========== From 690888f8da79674bb3da537af1f620870f972421 Mon Sep 17 00:00:00 2001 From: Abukhoyer Shaik Date: Wed, 3 Dec 2025 07:48:31 +0000 Subject: [PATCH 7/7] comments Signed-off-by: Abukhoyer Shaik --- .../models/test_causal_lm_models.py | 2 - .../models/test_image_text_to_text_models.py | 43 +++++++++++-------- 2 files changed, 25 insertions(+), 20 deletions(-) diff --git a/tests/transformers/models/test_causal_lm_models.py b/tests/transformers/models/test_causal_lm_models.py index a27351925..b9d573775 100644 --- a/tests/transformers/models/test_causal_lm_models.py +++ b/tests/transformers/models/test_causal_lm_models.py @@ -25,7 +25,6 @@ from QEfficient.utils.run_utils import ApiRunner from QEfficient.utils.test_utils import ModelConfig - CONFIG_PATH = os.path.join(os.path.dirname(__file__), "test_model_configs.json") with open(CONFIG_PATH, "r") as f: @@ -62,7 +61,6 @@ def get_hf_config_from_custom_config(model_name): return hf_config - def get_custom_n_layers(model_name): """ Function to set number layers of the variuos types of models such as swiftkv models and others diff --git a/tests/transformers/models/test_image_text_to_text_models.py b/tests/transformers/models/test_image_text_to_text_models.py index aa9fe2e3a..2d5500d08 100644 --- a/tests/transformers/models/test_image_text_to_text_models.py +++ b/tests/transformers/models/test_image_text_to_text_models.py @@ -127,7 +127,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( """ Unified function to test PyTorch model, PyTorch KV model, ONNX model, and Cloud AI 100 model. Handles standard VLM models, InternVL models, and Molmo models. - + Args: model_name: Hugging Face model identifier img_url: URL to image for testing @@ -144,16 +144,16 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( config: Pre-configured model config (optional) img_size: Image size for standard models (optional) """ - + is_intern_model = model_name == "OpenGVLab/InternVL2_5-1B" or model_name == "OpenGVLab/InternVL3_5-1B" is_molmo_model = model_name == "allenai/Molmo-7B-D-0924" - + # ========== Config and Model Loading ========== if config is None: config = AutoConfig.from_pretrained(model_name, trust_remote_code=True, padding=not is_molmo_model) config._attn_implementation = "eager" if (is_intern_model or is_molmo_model) else None config = set_num_layers(config, n_layer=n_layer) - + if is_intern_model: model_hf = AutoModelForCausalLM.from_pretrained( model_name, @@ -176,7 +176,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( processor = InternProcessor(model_hf, tokenizer) else: processor = AutoProcessor.from_pretrained(model_name, trust_remote_code=True, padding=True) - + if is_intern_model: prompt = [query] img_url_list = [img_url] @@ -202,7 +202,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( image = Image.open(requests.get(img_url, stream=True).raw) if model_name == "mistralai/Mistral-Small-3.1-24B-Instruct-2503": image = image.resize((1540, 1540)) - + # ========== Prepare Inputs and Get PyTorch HF Tokens ========== if is_intern_model: messages: List[List[str]] = [] @@ -274,14 +274,14 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) pytorch_hf_tokens = api_runner.run_vlm_hf_model_on_pytorch(model_hf, inputs) - + # pytorch_kv_tokens = api_runner.run_vlm_kv_model_on_pytorch(qeff_model.model) # assert (pytorch_kv_tokens == pytorch_hf_tokens).all(), ( # "Tokens don't match for pytorch HF output and pytorch KV output" # ) streamer = TextStreamer(processor.tokenizer) - + # ========== Export and Compile Model ========== if is_intern_model or is_molmo_model: qeff_model = QEFFAutoModelForCausalLM.from_pretrained( @@ -291,13 +291,13 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ) else: qeff_model = QEFFAutoModelForImageTextToText(model_hf, kv_offload=kv_offload) - + qeff_model.export() # onnx_model_path = qeff_model.export() # ort_tokens = api_runner.run_vlm_kv_model_on_ort(onnx_model_path) # assert (pytorch_hf_tokens == ort_tokens).all(), "Tokens don't match for pytorch HF output and ORT output" - + compile_kwargs = { "num_devices": num_devices, "prefill_seq_len": prompt_len, @@ -306,16 +306,16 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( "enable_qnn": enable_qnn, "qnn_config": qnn_config, } - + if is_intern_model: compile_kwargs["num_patches"] = 1 elif not is_molmo_model and img_size is not None: compile_kwargs["img_size"] = img_size - + qeff_model.compile(**compile_kwargs) - + # ========== Generate and Verify Output ========== - + if not is_intern_model and not is_molmo_model: inputs = processor(images=image, text=prompt, return_tensors="pt") if hasattr(qeff_model.model.config, "model_type") and qeff_model.model.config.model_type == "qwen2_5_vl": @@ -324,7 +324,7 @@ def check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( ) if "pixel_values" in inputs: inputs["pixel_values"] = inputs["pixel_values"].to(torch.float32) - + print("QPC Outputs (QAIC):") output = qeff_model.generate(inputs=inputs, generation_len=NEW_GENERATION_TOKENS, streamer=streamer) qpc_tokens = output.generated_ids[:, :-1] @@ -342,13 +342,20 @@ def test_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100(model_name, kv_offload ``Mandatory`` Args: :model_name (str): Hugging Face Model Card name, Example: ``gpt2`` """ - if model_name in ["meta-llama/Llama-4-Scout-17B-16E-Instruct", "allenai/Molmo-7B-D-0924", "meta-llama/Llama-3.2-11B-Vision-Instruct"]: + if model_name in [ + "meta-llama/Llama-4-Scout-17B-16E-Instruct", + "allenai/Molmo-7B-D-0924", + "meta-llama/Llama-3.2-11B-Vision-Instruct", + ]: pytest.skip("Test skipped for this model due to some issues.") - if model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B","Qwen/Qwen2.5-VL-3B-Instruct"] and not kv_offload: + if ( + model_name in ["OpenGVLab/InternVL2_5-1B", "OpenGVLab/InternVL3_5-1B", "Qwen/Qwen2.5-VL-3B-Instruct"] + and not kv_offload + ): pytest.skip("These models require kv_offload=True for testing.") # Get img_size for standard models, None for InternVL and Molmo img_size = model_config_dict[model_name].get("img_size") - + check_image_text_to_text_pytorch_vs_kv_vs_ort_vs_ai100( model_name=model_name, prompt_len=model_config_dict[model_name]["prompt_len"],