Qwen3 4B 2507 Distillation

Overview

This notebook distills reasoning capabilities from frontier models into Qwen3-4B-2507, a compact but powerful 4-billion-parameter model.

Choose Your Variant

The Thinking variant includes explicit chain-of-thought reasoning in <think> tags; the Instruct variant (unsloth/Qwen3-4B-Instruct-2507 with the qwen3-instruct template) answers directly, without chain-of-thought.

Base model: unsloth/Qwen3-4B-Thinking-2507
Chat template: qwen3-thinking

Best for: Complex reasoning, math, debugging, multi-step problems
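
For reference, a thinking-model response is structured roughly like this (an illustrative sketch, not verbatim model output):

<think>
The user asked for 17 + 25. The tens sum to 30, the ones to 12, so the total is 42.
</think>
17 + 25 = 42.

Everything inside the <think>...</think> block is intermediate reasoning; the text after the closing tag is the final answer that chat UIs typically display.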

Open in Google Colab


Full Training Script

Copy this into a Colab notebook or run locally:

# Cell 1: Install dependencies
!pip install unsloth datasets transformers trl
# Cell 2: Configuration
import os
import multiprocessing as mp
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"
# ======= CONFIGURE THESE =======
hf_account = "your-username" # Your HuggingFace username
hf_token = "hf_..." # Your HuggingFace write token
output_model_name = "Qwen3-4B-Thinking-2507-My-Distill"
# Choose your base model:
# - "unsloth/Qwen3-4B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-4B-Instruct-2507" for instruct/no-CoT
input_model = "unsloth/Qwen3-4B-Thinking-2507"
# Choose your chat template:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"
# Choose a dataset (or use your own JSONL file)
dataset_id = "TeichAI/claude-4.5-opus-high-reasoning-250x"
# dataset_file = "your-dataset.jsonl" # Uncomment to use local file
max_len = 8192
steps = 2000
# ================================
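# Note: the effective batch size is per_device_train_batch_size (1) x
# gradient_accumulation_steps (4), both set in Cell 6, so 2000 steps
# processes 2000 * 4 = 8000 examples; a small dataset will be cycled
# through many epochs to reach that count.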
# Cell 3: Load model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
# Cell 4: Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)
# Cell 5: Load dataset
from datasets import load_dataset
# Load from HuggingFace or local file
if "dataset_id" in dir() and dataset_id:
    raw_dataset = load_dataset(dataset_id, split="train")
else:
    raw_dataset = load_dataset("json", data_files=dataset_file, split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples")
# Cell 6: Train
from trl import SFTTrainer, SFTConfig
if __name__ == "__main__":
    mp.freeze_support()
    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=200,
        ),
    )

    # GPU info
    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")

    # Set token IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    # Train!
    trainer.train()
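
    # Optional smoke test before uploading (a minimal sketch; adjust the
    # prompt to your domain, or skip this if VRAM is tight):
    FastLanguageModel.for_inference(model)
    test_ids = tokenizer.apply_chat_template(
        [{"role": "user", "content": "What is 17 * 23?"}],
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    print(tokenizer.decode(model.generate(test_ids, max_new_tokens=256)[0]))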
# Cell 7: Upload to HuggingFace
model.push_to_hub_merged(
    f"{hf_account}/{output_model_name}",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)
print(f"✅ Uploaded merged model to {hf_account}/{output_model_name}")
# Cell 8: Create GGUF versions
model.push_to_hub_gguf(
    f"{hf_account}/{output_model_name}-GGUF",
    tokenizer,
    quantization_method=["bf16", "f16", "q8_0"],
    token=hf_token,
)
print(f"✅ Uploaded GGUF models to {hf_account}/{output_model_name}-GGUF")

Example Output Models

These models were created using this exact process: