Qwen3 30B-A3B MoE Distillation
Overview
Qwen3-30B-A3B is a Mixture-of-Experts (MoE) model with 30 billion total parameters, of which only ~3 billion are active per token. This keeps training and inference compute at a fraction of what a dense model of the same total size would require, while retaining most of its capacity.
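To see where the 30B-total / ~3B-active split comes from, you can inspect the checkpoint's config before downloading any weights. A minimal sketch, assuming the repo exposes the usual Qwen3-MoE config fields (`num_experts`, `num_experts_per_tok`); field names may differ between releases, so they are read defensively:

```python
from transformers import AutoConfig

# Inspect the MoE routing setup without downloading the weights.
# getattr(...) is used defensively in case a field name differs in your version.
cfg = AutoConfig.from_pretrained("unsloth/Qwen3-30B-A3B-Thinking-2507")
print("experts per MoE layer :", getattr(cfg, "num_experts", "n/a"))
print("experts routed per tok:", getattr(cfg, "num_experts_per_tok", "n/a"))
print("hidden layers         :", cfg.num_hidden_layers)
```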
Training Script
```python
import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"

# ======= CONFIGURE THESE =======
hf_account = "your-username"
hf_token = "hf_..."
output_model_name = "Qwen3-30B-A3B-Thinking-My-Distill"

# MoE model variants:
# - "unsloth/Qwen3-30B-A3B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-30B-A3B-Instruct-2507" for instruct
input_model = "unsloth/Qwen3-30B-A3B-Thinking-2507"

# Chat template matching model type:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"

dataset_id = "TeichAI/gemini-2.5-flash-11000x"
max_len = 8192
# ================================

from unsloth import FastModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch

# Load MoE model (note: using FastModel for MoE)
model, tokenizer = FastModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
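
# Optional sanity check (not in the original recipe): rough GPU memory footprint
# of the 4-bit quantized MoE weights that were just loaded.
print(f"Weights currently occupy ~{torch.cuda.memory_allocated() / 1e9:.1f} GB of GPU memory")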

# Apply LoRA
model = FastModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
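
# Optional sanity check (not in the original recipe): LoRA should leave only a small
# fraction of parameters trainable; print_trainable_parameters() is a standard PEFT helper.
model.print_trainable_parameters()
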
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

# Load dataset
raw_dataset = load_dataset(dataset_id, split="train")
dataset_rows = len(raw_dataset)
# Scale training steps with dataset size: 2,000 steps per 1,000 rows, with a 1,000-step floor
steps = max(1000, int(2000 * (dataset_rows / 1000)))

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples for {steps} steps")
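
# Optional: preview one formatted example to confirm the chat template was applied
# as expected (assumes the dataset's "messages" column uses the usual role/content schema).
print(train_dataset[0]["text"][:500])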

# Train
if __name__ == "__main__":
    mp.freeze_support()

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=100,
        ),
    )

    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")

    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    trainer.train()
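
    # Optional smoke test before uploading (a rough sketch, not part of the original
    # recipe): generate a short reply to confirm the fine-tuned adapter responds.
    sample = [{"role": "user", "content": "Briefly explain Mixture-of-Experts routing."}]
    inputs = tokenizer.apply_chat_template(
        sample, add_generation_prompt=True, return_tensors="pt"
    ).to(model.device)
    print(tokenizer.decode(model.generate(inputs, max_new_tokens=64)[0]))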

    # Upload
    model.push_to_hub_merged(
        f"{hf_account}/{output_model_name}",
        tokenizer,
        save_method="merged_16bit",
        token=hf_token,
    )

    model.push_to_hub_gguf(
        f"{hf_account}/{output_model_name}-GGUF",
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=hf_token,
    )
```

Key Differences from Dense Models
Uses FastModel instead of FastLanguageModel for MoE architectures
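
To make the difference concrete, here is a rough loading comparison. The dense checkpoint name is purely illustrative and not part of this recipe; both loaders otherwise take the same core arguments:

```python
from unsloth import FastLanguageModel, FastModel

# Dense Qwen3 checkpoint (illustrative name) -> FastLanguageModel
dense_model, dense_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen3-14B",  # example dense model, not from this recipe
    max_seq_length=8192,
    load_in_4bit=True,
)

# MoE Qwen3-30B-A3B checkpoint -> FastModel, as in the training script above
moe_model, moe_tokenizer = FastModel.from_pretrained(
    model_name="unsloth/Qwen3-30B-A3B-Thinking-2507",
    max_seq_length=8192,
    load_in_4bit=True,
)
```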