import shutil

import pytest
import torch
from transformers import AutoConfig, AutoTokenizer, Llama4ForConditionalGeneration
from transformers.models.gpt_oss.modeling_gpt_oss import GptOssForCausalLM

from auto_round import AutoRound


@pytest.fixture
def setup_gpt_oss():
    """Fixture to set up the GPT-OSS model and tokenizer."""
    model_name = "/models/gpt-oss-20b-BF16"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    config.num_hidden_layers = 1  # Reduce layers for testing
    model = GptOssForCausalLM(config)
    output_dir = "test_quantized_gpt_oss"
    return model, tokenizer, output_dir, config


@pytest.fixture
def setup_llama4():
    """Fixture to set up the Llama 4 model and tokenizer."""
    model_name = "/dataset/Llama-4-Scout-17B-16E-Instruct"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    config.vision_config.num_hidden_layers = 2  # Reduce layers for testing
    config.text_config.num_hidden_layers = 2
    model = Llama4ForConditionalGeneration(config)
    output_dir = "test_quantized_llama4"
    return model, tokenizer, output_dir, config


def quantize_model(model, tokenizer, output_dir, scheme, iters=0):
    """Helper function to quantize the model with the given scheme."""
    autoround = AutoRound(
        model,
        tokenizer,
        scheme=scheme,
        nsamples=2,
        iters=iters,
        fp_layers="self_attn,router,lm_head,mlp.gate",
    )
    quantized_model, save_folder = autoround.quantize_and_save(format="auto_round", output_dir=output_dir)
    return quantized_model


def test_gptoss(setup_gpt_oss):
    model, tokenizer, output_dir, config = setup_gpt_oss

    # The attribute below was populated for the full model;
    # remove it to avoid a config mismatch when the quantized model is reloaded.
    delattr(model.config, "layer_types")

    quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4")

    # Ensure the quantized model is not None
    assert quantized_model is not None, "Quantized model should not be None."

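    # Reload the exported checkpoint and check that the packed weights of every quantized
    # layer match the in-memory quantized model.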
    loaded_model = GptOssForCausalLM.from_pretrained(output_dir)
    quantized_model.to("cuda")
    loaded_model.to("cuda")
    for n, m in quantized_model.named_modules():
        if m.__class__.__name__ == "QuantLinear":
            loaded_m = loaded_model.get_submodule(n)
            assert (loaded_m.weight_packed == m.weight_packed).all()

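    # Smoke test: run a short forward pass on random token ids with the reloaded model.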
    inp = torch.randint(0, 100, (1, 64)).to("cuda")
    with torch.inference_mode():
        loaded_out = loaded_model(inp)

    # Clean up the output directory after the test
    shutil.rmtree(output_dir, ignore_errors=True)


def test_llama4(setup_llama4):
    model, tokenizer, output_dir, config = setup_llama4

    # The attributes below were populated for the full model; reset or remove them
    # to avoid a config mismatch when the quantized model is reloaded.
    model.config.text_config.no_rope_layers = []
    delattr(model.config.text_config, "moe_layers")
    delattr(model.config.text_config, "layer_types")

    quantized_model = quantize_model(model, tokenizer, output_dir, "MXFP4")

    # Ensure the quantized model is not None
    assert quantized_model is not None, "Quantized model should not be None."

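    # Same round-trip check as in test_gptoss: packed weights must survive save and reload.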
    loaded_model = Llama4ForConditionalGeneration.from_pretrained(output_dir)
    quantized_model.to("cuda")
    loaded_model.to("cuda")
    for n, m in quantized_model.named_modules():
        if m.__class__.__name__ == "QuantLinear":
            loaded_m = loaded_model.get_submodule(n)
            assert (loaded_m.weight_packed == m.weight_packed).all()

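    # Smoke test: forward pass on random token ids with the reloaded model.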
    inp = torch.randint(0, 100, (1, 64)).to("cuda")
    with torch.inference_mode():
        loaded_out = loaded_model(inp)

    # Clean up the output directory after the test
    shutil.rmtree(output_dir, ignore_errors=True)