metascroy
/

Ministral-3-3B-Instruct-2512-int8-int4-unsloth

@@ -21,3 +21,186 @@ language:
 This mistral3 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)

 This mistral3 model was trained 2x faster with [Unsloth](https://github.com/unslothai/unsloth) and Huggingface's TRL library.
 [<img src="https://raw.githubusercontent.com/unslothai/unsloth/main/images/unsloth%20made%20with%20love.png" width="200"/>](https://github.com/unslothai/unsloth)
+```python
+################################################################################
+# We first load the model for QAT using the mobile CPU friendly int8-int4 scheme
+################################################################################
+from unsloth import FastVisionModel
+from unsloth.chat_templates import (
+    get_chat_template,  # NOTE(review): imported but never used in this script
+)
+import torch
+MODEL_ID = "unsloth/Ministral-3-3B-Instruct-2512"  # base checkpoint; its last path component is reused in the save name
+QAT_SCHEME = "int8-int4"  # passed as qat_scheme below and embedded in the save name
+model, tokenizer = FastVisionModel.from_pretrained(  # "tokenizer" is later called with an image and text, i.e. it is used as a multimodal processor
+    model_name = MODEL_ID,
+    max_seq_length = 2048,
+    dtype = torch.bfloat16,
+    load_in_4bit = False,  # weights are loaded in bfloat16 (dtype above), not 4-bit
+    full_finetuning = True,
+    # ExecuTorch CPU quantization scheme:
+    # quantize embedding to 8-bits, and quantize linear layers to 4-bits
+    # with 8-bit dynamically quantized activations
+    qat_scheme = QAT_SCHEME,
+)
+print(model)  # print the module tree for inspection
+################################################################################
+# Data prep
+################################################################################
+from datasets import load_dataset
+dataset = load_dataset("unsloth/LaTeX_OCR", split = "train")  # rows are read below through their "image" and "text" fields
+# Convert the dataset into a conversational format (one user turn + one assistant turn per row)
+instruction = "Write the LaTeX representation for this image."  # user prompt paired with every image; read as a global by convert_to_conversation
+def convert_to_conversation(sample):
+    conversation = [
+        { "role": "user",
+          "content" : [
+            {"type" : "text",  "text"  : instruction},
+            {"type" : "image", "image" : sample["image"]} ]
+        },
+        { "role" : "assistant",
+          "content" : [
+            {"type" : "text",  "text"  : sample["text"]} ]
+        },
+    ]
+    return { "messages" : conversation }
+converted_dataset = [convert_to_conversation(sample) for sample in dataset]
+print(converted_dataset[0])
+################################################################################
+# Before finetuning: generate once from the freshly loaded model as a baseline
+################################################################################
+FastVisionModel.for_inference(model) # Enable for inference!
+image = dataset[2]["image"]  # image from row index 2 of the raw (unconverted) dataset
+instruction = "Write the LaTeX representation for this image."  # same prompt text as used for the training samples
+messages = [
+    {"role": "user", "content": [
+        {"type": "image"},  # placeholder only; the image itself is passed to the processor call below
+        {"type": "text", "text": instruction}
+    ]}
+]
+input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
+inputs = tokenizer(
+    image,
+    input_text,
+    add_special_tokens = False,  # presumably because the chat template already inserted them - TODO confirm
+    return_tensors = "pt",
+).to("cuda")  # requires a CUDA device
+from transformers import TextStreamer
+text_streamer = TextStreamer(tokenizer, skip_prompt = True)  # stream the generated text without echoing the prompt
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 64,  # output is consumed via the streamer; the return value is discarded
+                   use_cache = True, temperature = 1.5, min_p = 0.1)
+################################################################################
+# Define trainer
+################################################################################
+from unsloth.trainer import UnslothVisionDataCollator
+from trl import SFTTrainer, SFTConfig
+from unsloth import is_bf16_supported
+trainer = SFTTrainer(
+    model = model,
+    tokenizer = tokenizer,
+    data_collator = UnslothVisionDataCollator(model, tokenizer), # Must use!
+    train_dataset = converted_dataset,
+    args = SFTConfig(
+        per_device_train_batch_size = 4,
+        gradient_accumulation_steps = 2,
+        warmup_steps = 5,
+        max_steps = 30,
+        # num_train_epochs = 1, # Set this instead of max_steps for full training runs
+        learning_rate = 3e-5,
+        logging_steps = 1,
+        optim = "adamw_8bit",
+        fp16 = not is_bf16_supported(), # Use fp16 if bf16 is not supported
+        bf16 = is_bf16_supported(), # Use bf16 if supported
+        weight_decay = 0.001,
+        lr_scheduler_type = "linear",
+        seed = 3407,
+        output_dir = "outputs",
+        report_to = "none",
+        # You MUST put the below items for vision finetuning:
+        remove_unused_columns = False,
+        dataset_text_field = "",
+        dataset_kwargs = {"skip_prepare_dataset": True},
+        max_length = 2048,
+    ),
+)
+################################################################################
+# Do fine tuning (runs the max_steps = 30 schedule configured on the trainer)
+################################################################################
+trainer_stats = trainer.train()  # return value is kept but not used again in this script
+################################################################################
+# Inference after finetuning: same image and prompt as the baseline run, for comparison
+################################################################################
+FastVisionModel.for_inference(model) # Enable for inference!
+image = dataset[2]["image"]  # same row index 2 as the pre-finetuning generation
+instruction = "Write the LaTeX representation for this image."
+messages = [
+    {"role": "user", "content": [
+        {"type": "image"},  # placeholder only; the image itself is passed to the processor call below
+        {"type": "text", "text": instruction}
+    ]}
+]
+input_text = tokenizer.apply_chat_template(messages, add_generation_prompt = True)
+inputs = tokenizer(
+    image,
+    input_text,
+    add_special_tokens = False,  # presumably because the chat template already inserted them - TODO confirm
+    return_tensors = "pt",
+).to("cuda")  # requires a CUDA device
+from transformers import TextStreamer  # redundant re-import; TextStreamer was already imported for the baseline run
+text_streamer = TextStreamer(tokenizer, skip_prompt = True)
+_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128,  # 128 new tokens here vs. 64 in the baseline run
+                   use_cache = True, temperature = 1.5, min_p = 0.1)
+################################################################################
+# Convert model to torchao format and save
+################################################################################
+from unsloth.models._utils import _convert_torchao_model  # underscore-prefixed helper from a private module; may change between unsloth releases
+_convert_torchao_model(model)  # return value is not used, so the conversion presumably happens in place - TODO confirm
+model_name = MODEL_ID.split("/")[-1]  # "Ministral-3-3B-Instruct-2512"
+save_to = f"{model_name}-{QAT_SCHEME}-unsloth"  # "Ministral-3-3B-Instruct-2512-int8-int4-unsloth"
+# Save locally
+model.save_pretrained(save_to, safe_serialization=False)  # presumably the torchao-converted weights cannot be written as safetensors - TODO confirm
+tokenizer.save_pretrained(save_to)
+# Also push to the hub (this script does both; drop the lines below to keep only the local copy)
+from huggingface_hub import get_token, whoami
+def _get_username():
+    token = get_token()
+    username = whoami(token=token)["name"]
+    return username
+username = _get_username()  # account that will own the uploaded repo
+model.push_to_hub(f"{username}/{save_to}", safe_serialization=False)  # same serialization setting as the local save
+tokenizer.push_to_hub(f"{username}/{save_to}")  # pushed to the same repo id as the model
+```