Qwen3 8B Distillation
Overview
The Qwen3-8B and Qwen3-14B models offer a significant step up in capability from the 4B variant while still being trainable on consumer hardware.
Configuration Changes
The 8B training script is nearly identical to the 4B version. Key differences:
```python
# Use 8B base model
input_model = "unsloth/Qwen3-8B"  # or unsloth/Qwen3-14B

# Use the standard qwen3 chat template
chat_template = "qwen3"

# Reduce batch size if needed for memory
per_device_train_batch_size = 1
gradient_accumulation_steps = 4

# Optional larger context length, if you have enough VRAM
max_len = 8192  # Can reduce to 4096 or 2048 if OOM
```
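If you are unsure how much headroom you have for the larger context length, you can query the GPU before settling on `max_len`. This is a small standalone sketch using standard PyTorch utilities; it is not part of the training script, and the 8192-vs-4096 decision is left to you:

```python
import torch

# Print total and currently free VRAM so you can decide on max_len.
props = torch.cuda.get_device_properties(0)
free_bytes, total_bytes = torch.cuda.mem_get_info()
print(f"GPU: {props.name}")
print(f"Total VRAM: {total_bytes / 1024**3:.1f} GiB")
print(f"Free VRAM:  {free_bytes / 1024**3:.1f} GiB")
```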
Full Training Script

```python
import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"

# ======= CONFIGURE THESE =======
hf_account = "your-username"
hf_token = "hf_..."
output_model_name = "Qwen3-8B-My-Distill"
input_model = "unsloth/Qwen3-8B"  # or unsloth/Qwen3-14B
chat_template = "qwen3"
dataset_id = "TeichAI/deepseek-v3.2-speciale-1000x"
max_len = 8192
steps = 2000
# ================================

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)

# Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

# Load dataset
raw_dataset = load_dataset(dataset_id, split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)

# Train
if __name__ == "__main__":
    mp.freeze_support()

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
        ),
    )

    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    trainer.train()

    # Upload
    model.push_to_hub_merged(
        f"{hf_account}/{output_model_name}",
        tokenizer,
        save_method="merged_16bit",
        token=hf_token,
    )

    model.push_to_hub_gguf(
        f"{hf_account}/{output_model_name}-GGUF",
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=hf_token,
    )
```
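Before relying on the uploaded weights, it can be worth running one quick generation to confirm the adapter learned something sensible. The following is a minimal sketch, not part of the original script: it assumes you run it after training completes (for example, right after `trainer.train()`) with the same `model` and `tokenizer` objects, that a CUDA GPU is available, and that the prompt is only an illustration.

```python
# Hedged sketch: quick sanity check after training.
# Switch the LoRA model into Unsloth's inference mode and generate once.
FastLanguageModel.for_inference(model)

messages = [{"role": "user", "content": "Explain LoRA fine-tuning in one paragraph."}]
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```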
Memory Optimization Tips
If you encounter OOM (Out of Memory) errors:
- Reduce `max_seq_length` to 4096 or 2048
- Enable gradient checkpointing (already enabled with `"unsloth"`)
- Reduce the LoRA rank from 32 to 16
- Keep 4-bit loading; switching to 8-bit is slightly more accurate but uses more memory
```python
# Memory-saving configuration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=4096,  # Reduced from 8192
    load_in_4bit=True,
    token=hf_token,
)

model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Reduced from 32
    # ... rest of config
)
```
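To check whether the reduced settings actually fit, one option is a short trial run (for example, temporarily lowering `max_steps`) followed by a look at PyTorch's peak-memory counters. This is a minimal sketch using standard `torch.cuda` statistics, added here for illustration rather than taken from the original guide:

```python
import torch

# Run after a short training run to see how close you are to the VRAM limit.
peak_allocated = torch.cuda.max_memory_allocated() / 1024**3
peak_reserved = torch.cuda.max_memory_reserved() / 1024**3
print(f"Peak allocated: {peak_allocated:.2f} GiB")
print(f"Peak reserved:  {peak_reserved:.2f} GiB")
```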