Qwen3 4B 2507 Distillation

Overview

This notebook distills reasoning capabilities from frontier models into Qwen3-4B-2507, a compact but powerful 4-billion-parameter model.

Choose Your Variant

The Thinking variant includes explicit chain-of-thought reasoning in <think> tags; the Instruct variant (unsloth/Qwen3-4B-Instruct-2507 with the qwen3-instruct template) answers directly, without chain-of-thought.

Base model: unsloth/Qwen3-4B-Thinking-2507
Chat template: qwen3-thinking

Best for: Complex reasoning, math, debugging, multi-step problems
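
For reference, a thinking-model response is structured roughly like this (an illustrative sketch, not verbatim model output):

<think>
The user asked for 17 + 25. The tens sum to 30, the ones to 12, so the total is 42.
</think>
17 + 25 = 42.

Everything inside the <think>...</think> block is intermediate reasoning; the text after the closing tag is the final answer that chat UIs typically display.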

Open in Google Colab


Full Training Script

Copy this into a Colab notebook or run locally:

# Cell 1: Install dependencies
!pip install unsloth datasets transformers trl
# Cell 2: Configuration
import os
import multiprocessing as mp
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"
# ======= CONFIGURE THESE =======
hf_account = "your-username" # Your HuggingFace username
hf_token = "hf_..." # Your HuggingFace write token
output_model_name = "Qwen3-4B-Thinking-2507-My-Distill"
# Choose your base model:
# - "unsloth/Qwen3-4B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-4B-Instruct-2507" for instruct/no-CoT
input_model = "unsloth/Qwen3-4B-Thinking-2507"
# Choose your chat template:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"
# Choose a dataset (or use your own JSONL file)
dataset_id = "TeichAI/claude-4.5-opus-high-reasoning-250x"
# dataset_file = "your-dataset.jsonl" # Uncomment to use local file
max_len = 8192
steps = 2000
# ================================
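# Note: the effective batch size is per_device_train_batch_size (1) x
# gradient_accumulation_steps (4), both set in Cell 6, so 2000 steps
# processes 2000 * 4 = 8000 examples; a small dataset will be cycled
# through many epochs to reach that count.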
# Cell 3: Load model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
# Cell 4: Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)
# Cell 5: Load dataset
from datasets import load_dataset
# Load from HuggingFace or local file
if "dataset_id" in dir() and dataset_id:
    raw_dataset = load_dataset(dataset_id, split="train")
else:
    raw_dataset = load_dataset("json", data_files=dataset_file, split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples")
# Cell 6: Train
from trl import SFTTrainer, SFTConfig
if __name__ == "__main__":
    mp.freeze_support()
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=200,
        ),
    )

    # GPU info
    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")

    # Set token IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    # Train!
    trainer.train()
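
    # Optional smoke test before uploading (a minimal sketch; adjust the
    # prompt to your domain, or skip this if VRAM is tight):
    FastLanguageModel.for_inference(model)
    test_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": "What is 17 * 23?"}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    print(tokenizer.decode(model.generate(test_ids, max_new_tokens=256)[0]))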
# Cell 7: Upload to HuggingFace
model.push_to_hub_merged(
    f"{hf_account}/{output_model_name}",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)
print(f"✅ Uploaded merged model to {hf_account}/{output_model_name}")
# Cell 8: Create GGUF versions
model.push_to_hub_gguf(
    f"{hf_account}/{output_model_name}-GGUF",
    tokenizer,
    quantization_method=["bf16", "f16", "q8_0"],
    token=hf_token,
)
print(f"✅ Uploaded GGUF models to {hf_account}/{output_model_name}-GGUF")

Example Output Models

These models were created using this exact process: