
Qwen3 30B-A3B MoE Distillation

Overview

Qwen3-30B-A3B is a Mixture-of-Experts (MoE) model with 30 billion total parameters, of which only about 3 billion are active per token: the router activates a small subset of experts for each token, so you get strong performance at a fraction of the compute cost of running all 30 billion parameters.
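
To see the routing setup for yourself, you can pull just the checkpoint's config. The sketch below is illustrative only: it assumes the same repo id used later in this guide and that the config exposes num_experts and num_experts_per_tok fields (guarded with getattr in case the names differ for your checkpoint).

from transformers import AutoConfig

# Download only config.json and print the expert-routing fields.
config = AutoConfig.from_pretrained("unsloth/Qwen3-30B-A3B-Thinking-2507")

# Field names are assumed from the Qwen MoE config family; check the
# checkpoint's config.json if either prints "n/a".
print("experts per MoE layer:", getattr(config, "num_experts", "n/a"))
print("experts active per token:", getattr(config, "num_experts_per_tok", "n/a"))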


Training Script

import os
import multiprocessing as mp
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"
# ======= CONFIGURE THESE =======
hf_account = "your-username"
hf_token = "hf_..."
output_model_name = "Qwen3-30B-A3B-Thinking-My-Distill"
# MoE model variants:
# - "unsloth/Qwen3-30B-A3B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-30B-A3B-Instruct-2507" for instruct
input_model = "unsloth/Qwen3-30B-A3B-Thinking-2507"
# Chat template matching model type:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"
dataset_id = "TeichAI/gemini-2.5-flash-11000x"
max_len = 8192
# ================================
from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch
# Load MoE model (note: using FastModel for MoE)
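# load_in_4bit quantizes the frozen base weights (QLoRA-style), sharply reducing
# the VRAM needed to hold the 30B-parameter MoE during fine-tuning.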
model, tokenizer = FastModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
# Apply LoRA
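# r=32 with lora_alpha=32 keeps the adapter updates low-rank; dropout is disabled and
# Unsloth's gradient checkpointing trades extra compute for lower activation memory.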
model = FastModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)
# Load dataset
raw_dataset = load_dataset(dataset_id, split="train")
dataset_rows = len(raw_dataset)
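# Heuristic: scale training length with dataset size (~2 steps per row), with a floor of 1000 steps.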
steps = max(1000, int(2000 * (dataset_rows / 1000)))
def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}
train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples for {steps} steps")
# Train
if __name__ == "__main__":
    mp.freeze_support()
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=100,
        ),
    )
    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id
    trainer.train()
    # Upload
    model.push_to_hub_merged(
        f"{hf_account}/{output_model_name}",
        tokenizer,
        save_method="merged_16bit",
        token=hf_token,
    )
    model.push_to_hub_gguf(
        f"{hf_account}/{output_model_name}-GGUF",
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=hf_token,
    )
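
Once the merged 16-bit weights are on the Hub, you can smoke-test them with plain transformers. This is a minimal sketch, not part of the training script: it assumes the repo id you configured above, that the push succeeded, and that you have enough GPU memory for the 16-bit weights (otherwise prefer the GGUF quants with llama.cpp).

from transformers import AutoModelForCausalLM, AutoTokenizer

# Hypothetical repo id: substitute your own hf_account / output_model_name.
repo_id = "your-username/Qwen3-30B-A3B-Thinking-My-Distill"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(repo_id, torch_dtype="auto", device_map="auto")

messages = [{"role": "user", "content": "Briefly explain what a Mixture-of-Experts model is."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=512)
print(tokenizer.decode(outputs[0][inputs.shape[-1]:], skip_special_tokens=True))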

Key Differences from Dense Models

Uses FastModel instead of FastLanguageModel for MoE architectures