---
base_model:
- Qwen/Qwen3-VL-30B-A3B-Instruct
---

How to quant:

```
git clone https://github.com/vllm-project/llm-compressor.git
cd llm-compressor
pip install -e .
pip install transformers==4.57.0
export TOKENIZERS_PARALLELISM=false
cd ..
```

Script for quant: save the following code as `quant.py`, then run `python3 quant.py`.

```
import torch
from datasets import load_dataset
from transformers import AutoProcessor, Qwen3VLMoeForConditionalGeneration

from llmcompressor import oneshot
from llmcompressor.modeling import replace_modules_for_calibration
from llmcompressor.modifiers.awq import AWQModifier
from llmcompressor.utils import dispatch_for_generation

# NOTE: Requires a minimum of transformers 4.57.0
MODEL_ID = "Qwen/Qwen3-VL-30B-A3B-Instruct"

# Load model.
model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
    MODEL_ID, torch_dtype=torch.bfloat16, device_map=None, trust_remote_code=True
)
processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
model = replace_modules_for_calibration(model)

DATASET_ID = "neuralmagic/calibration"
NUM_CALIBRATION_SAMPLES = 256
MAX_SEQUENCE_LENGTH = 8192

ds = load_dataset(DATASET_ID, name="LLM", split=f"train[:{NUM_CALIBRATION_SAMPLES}]")
ds = ds.shuffle(seed=42)


def preprocess_function(example):
    messages = []
    for message in example["messages"]:
        messages.append(
            {
                "role": message["role"],
                "content": [{"type": "text", "text": message["content"]}],
            }
        )

    return processor.apply_chat_template(
        messages,
        return_tensors="pt",
        padding=False,
        truncation=True,
        max_length=MAX_SEQUENCE_LENGTH,
        tokenize=True,
        add_special_tokens=False,
        return_dict=True,
        add_generation_prompt=False,
    )


ds = ds.map(preprocess_function, batched=False, remove_columns=ds.column_names)


def data_collator(batch):
    assert len(batch) == 1
    return {
        key: (
            torch.tensor(value)
            if key != "pixel_values"
            else torch.tensor(value, dtype=torch.bfloat16).squeeze(0)
        )
        for key, value in batch[0].items()
    }


# Configure AWQ quantization with smoothing and balancing
recipe = AWQModifier(
    ignore=[
        "re:.*embed_tokens",
        "re:.*input_layernorm$",
        "re:.*mlp[.]gate$",
        "re:.*post_attention_layernorm$",
        "re:.*norm$",
        "re:model[.]visual.*",
        "re:visual.*",
        "lm_head",
    ],
    mappings=[
        {
            "smooth_layer": "re:.*input_layernorm$",
            "balance_layers": ["re:.*q_proj$", "re:.*k_proj$", "re:.*v_proj$"],
        },
        {
            "smooth_layer": "re:.*v_proj$",
            "balance_layers": ["re:.*o_proj$"],
        },
        {
            "smooth_layer": "re:.*post_attention_layernorm$",
            "balance_layers": ["re:.*gate_proj$", "re:.*up_proj$"],
        },
        {
            "smooth_layer": "re:.*up_proj$",
            "balance_layers": ["re:.*down_proj$"],
        },
    ],
    duo_scaling=True,
    config_groups={
        "group_0": {
            "targets": ["Linear"],
            "weights": {
                "num_bits": 8,
                "type": "int",
                "symmetric": True,
                "group_size": 32,
                "strategy": "group",
                "block_structure": None,
                "dynamic": False,
                "actorder": None,
                "observer": "mse",
                "observer_kwargs": {},
            },
            "input_activations": None,
            "output_activations": None,
            "format": None,
        }
    },
)

# Apply AWQ quantization.
oneshot(
    model=model,
    processor=processor,
    recipe=recipe,
    dataset=ds,
    max_seq_length=MAX_SEQUENCE_LENGTH,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    data_collator=data_collator,
)

print("========== SAMPLE GENERATION ==============")
dispatch_for_generation(model)
input_ids = processor(text="Hello my name is", return_tensors="pt").input_ids.to("cuda")
output = model.generate(input_ids, max_new_tokens=20)
print(processor.decode(output[0]))
print("==========================================")
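
# Optional multimodal smoke test (a hedged sketch, not part of the original
# recipe): the generation above is text-only and never exercises the vision
# tower. Uncomment to try it. This assumes the transformers chat-template
# image loading via {"type": "image", "url": ...} works for this processor;
# the URL below is a placeholder you must replace with a real image.
# mm_messages = [
#     {
#         "role": "user",
#         "content": [
#             {"type": "image", "url": "https://example.com/sample.jpg"},
#             {"type": "text", "text": "Describe this image."},
#         ],
#     }
# ]
# mm_inputs = processor.apply_chat_template(
#     mm_messages, add_generation_prompt=True, tokenize=True,
#     return_dict=True, return_tensors="pt",
# ).to(model.device)
# print(processor.decode(model.generate(**mm_inputs, max_new_tokens=30)[0]))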

# Save to disk in compressed-tensors format.
SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-AWQ-W8A16-mse-seq"
model.save_pretrained(SAVE_DIR, save_compressed=True)
processor.save_pretrained(SAVE_DIR)
```
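
The checkpoint is saved in compressed-tensors format, so it can be loaded directly by inference engines that support that format. Below is a minimal sketch of serving it with vLLM; it assumes a recent vLLM build with compressed-tensors and Qwen3-VL support, and that `quant.py` produced the directory `Qwen3-VL-30B-A3B-Instruct-AWQ-W8A16-mse-seq` (the `SAVE_DIR` computed in the script).

```
from vllm import LLM, SamplingParams

# Load the compressed-tensors checkpoint produced by quant.py.
# The path is the SAVE_DIR from the script; adjust if you renamed it.
llm = LLM(model="Qwen3-VL-30B-A3B-Instruct-AWQ-W8A16-mse-seq")

# Text-only smoke test mirroring the sample generation in quant.py.
params = SamplingParams(max_tokens=20)
outputs = llm.generate(["Hello my name is"], params)
print(outputs[0].outputs[0].text)
```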