
Qwen3 8B Distillation

Overview

The Qwen3-8B and Qwen3-14B models offer a significant step up in capability from the 4B variant while still being trainable on consumer hardware.
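
Before committing to the 8B or 14B checkpoint, it is worth checking how much GPU memory is actually free. A minimal check using PyTorch (illustrative only, not part of the training script, assuming a single CUDA device):

import torch

# Quick VRAM check before picking a model size (illustrative only).
if torch.cuda.is_available():
    props = torch.cuda.get_device_properties(0)
    total_gb = props.total_memory / 1024**3
    free_bytes, _ = torch.cuda.mem_get_info()
    print(f"GPU: {props.name}, total {total_gb:.1f} GiB, free {free_bytes / 1024**3:.1f} GiB")
else:
    print("No CUDA GPU detected; these models need a GPU to train.")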

Open in Google Colab

Configuration Changes

The 8B training script is nearly identical to the 4B version. Key differences:

# Use 8B base model
input_model = "unsloth/Qwen3-8B" # or unsloth/Qwen3-14B
# Use the standard qwen3 chat template
chat_template = "qwen3"
# Reduce batch size if needed for memory
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
# Optional larger context length, if you have enough VRAM
max_len = 8192 # Can reduce to 4096 or 2048 if OOM
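
For reference, each optimizer step sees an effective batch of per_device_train_batch_size × gradient_accumulation_steps examples per GPU, so if you lower the batch size further to fit memory, raising the accumulation steps keeps the effective batch roughly the same. A minimal illustration (not part of the training script):

# Effective batch size seen by each optimizer step (per GPU).
per_device_train_batch_size = 1
gradient_accumulation_steps = 4
effective_batch = per_device_train_batch_size * gradient_accumulation_steps
print(effective_batch)  # 4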

Full Training Script

import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"

# ======= CONFIGURE THESE =======
hf_account = "your-username"
hf_token = "hf_..."
output_model_name = "Qwen3-8B-My-Distill"
input_model = "unsloth/Qwen3-8B"  # or unsloth/Qwen3-14B
chat_template = "qwen3"
dataset_id = "TeichAI/deepseek-v3.2-speciale-1000x"
max_len = 8192
steps = 2000
# ================================

from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
import torch

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)

# Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)

tokenizer = get_chat_template(tokenizer, chat_template=chat_template)

# Load dataset
raw_dataset = load_dataset(dataset_id, split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)

# Train
if __name__ == "__main__":
    mp.freeze_support()

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
        ),
    )

    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    trainer.train()

    # Upload
    model.push_to_hub_merged(
        f"{hf_account}/{output_model_name}",
        tokenizer,
        save_method="merged_16bit",
        token=hf_token,
    )
    model.push_to_hub_gguf(
        f"{hf_account}/{output_model_name}-GGUF",
        tokenizer,
        quantization_method=["bf16", "f16", "q8_0"],
        token=hf_token,
    )
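
Once the merged weights have been pushed, a short generation pass is a quick sanity check that the chat template and tokenizer were saved correctly. A minimal sketch, assuming the merged 16-bit repo uploaded above is available and that you have enough VRAM to reload it (pass token=hf_token if the repo is private):

# Quick sanity check of the uploaded model (illustrative only).
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=f"{hf_account}/{output_model_name}",
    max_seq_length=4096,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # switch to inference mode

messages = [{"role": "user", "content": "Explain LoRA in one sentence."}]
inputs = tokenizer.apply_chat_template(
    messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

outputs = model.generate(inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))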

Memory Optimization Tips

If you encounter OOM (Out of Memory) errors:

  1. Reduce max_seq_length to 4096 or 2048
  2. Confirm gradient checkpointing is on (already enabled via use_gradient_checkpointing="unsloth")
  3. Reduce the LoRA rank from 32 to 16
  4. Keep 4-bit loading (load_in_4bit=True); switching to 8-bit is slightly more accurate but uses more memory, so avoid it while you are memory-constrained

The configuration below applies the first and third of these:
# Memory-saving configuration
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=4096,  # Reduced from 8192
    load_in_4bit=True,
    token=hf_token,
)
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Reduced from 32
    # ... rest of config
)
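
To confirm the savings, it can help to log peak GPU memory after a short training run. A minimal check using PyTorch's built-in counters (not part of the original script), run after trainer.train() returns:

import torch

# Report peak GPU memory to verify the memory-saving settings.
peak_gb = torch.cuda.max_memory_allocated() / 1024**3
reserved_gb = torch.cuda.max_memory_reserved() / 1024**3
print(f"Peak allocated: {peak_gb:.2f} GiB, peak reserved: {reserved_gb:.2f} GiB")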