Qwen3 30B-A3B MoE Distillation
Overview
Qwen3-30B-A3B is a Mixture-of-Experts (MoE) model with 30 billion total parameters, of which only ~3 billion are active per token. This keeps training and inference compute at a fraction of what a dense model of the same total size would require, while retaining most of its capacity.
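To see where the 30B-total / ~3B-active split comes from, you can inspect the checkpoint's config before downloading any weights. A minimal sketch, assuming the repo exposes the usual Qwen3-MoE config fields (`num_experts`, `num_experts_per_tok`); field names may differ between releases, so they are read defensively:

```python
from transformers import AutoConfig

# Inspect the MoE routing setup without downloading the weights.
# getattr(...) is used defensively in case a field name differs in your version.
cfg = AutoConfig.from_pretrained("unsloth/Qwen3-30B-A3B-Thinking-2507")
print("experts per MoE layer :", getattr(cfg, "num_experts", "n/a"))
print("experts routed per tok:", getattr(cfg, "num_experts_per_tok", "n/a"))
print("hidden layers         :", cfg.num_hidden_layers)
```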
Training Script
```python
import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"

# ======= CONFIGURE THESE =======
hf_account = "your-username"
hf_token = "hf_..."
output_model_name = "Qwen3-30B-A3B-Thinking-My-Distill"

# MoE model variants:
# - "unsloth/Qwen3-30B-A3B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-30B-A3B-Instruct-2507" for instruct
input_model = "unsloth/Qwen3-30B-A3B-Thinking-2507"

# Chat template matching model type:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"

dataset_id = "TeichAI/gemini-2.5-flash-11000x"
max_len = 8192
# ================================

from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch

# Load MoE model (note: using FastModel for MoE)
model, tokenizer = FastModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
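
# Optional sanity check (not in the original recipe): rough GPU memory footprint
# of the 4-bit quantized MoE weights that were just loaded.
print(f"Weights currently occupy ~{torch.cuda.memory_allocated() / 1e9:.1f} GB of GPU memory")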

# Apply LoRA
model = FastModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
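
# Optional sanity check (not in the original recipe): LoRA should leave only a small
# fraction of parameters trainable; print_trainable_parameters() is a standard PEFT helper.
model.print_trainable_parameters()
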
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

# Load dataset
raw_dataset = load_dataset(dataset_id, split="train")
dataset_rows = len(raw_dataset)
# Scale training steps with dataset size: 2,000 steps per 1,000 rows, with a 1,000-step floor
steps = max(1000, int(2000 * (dataset_rows / 1000)))

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples for {steps} steps")
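
# Optional: preview one formatted example to confirm the chat template was applied
# as expected (assumes the dataset's "messages" column uses the usual role/content schema).
print(train_dataset[0]["text"][:500])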

# Train
if __name__ == "__main__":
    mp.freeze_support()

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=100,
        ),
    )

    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")

    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    trainer.train()
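
    # Optional smoke test before uploading (a rough sketch, not part of the original
    # recipe): generate a short reply to confirm the fine-tuned adapter responds.
    sample = [{"role": "user", "content": "Briefly explain Mixture-of-Experts routing."}]
    inputs = tokenizer.apply_chat_template(
        sample, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    print(tokenizer.decode(model.generate(inputs, max_new_tokens=64)[0]))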

    # Upload
    model.push_to_hub_merged(
        f"{hf_account}/{output_model_name}",
        tokenizer,
        save_method="merged_16bit",
        token=hf_token,
    )

    model.push_to_hub_gguf(
        f"{hf_account}/{output_model_name}-GGUF",
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=hf_token,
    )
```

Key Differences from Dense Models
Uses FastModel instead of FastLanguageModel for MoE architectures
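
To make the difference concrete, here is a rough loading comparison. The dense checkpoint name is purely illustrative and not part of this recipe; both loaders otherwise take the same core arguments:

```python
from unsloth import FastLanguageModel, FastModel

# Dense Qwen3 checkpoint (illustrative name) -> FastLanguageModel
dense_model, dense_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-14B",  # example dense model, not from this recipe
    max_seq_length=8192,
    load_in_4bit=True,
)

# MoE Qwen3-30B-A3B checkpoint -> FastModel, as in the training script above
moe_model, moe_tokenizer = FastModel.from_pretrained(
    model_name="unsloth/Qwen3-30B-A3B-Thinking-2507",
    max_seq_length=8192,
    load_in_4bit=True,
)
```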