Qwen3 4B 2507 Distillation
Overview
This notebook distills reasoning capabilities from frontier models into Qwen3-4B-2507, a compact but powerful 4-billion-parameter model.
Choose Your Variant
The Thinking variant includes explicit chain-of-thought reasoning in <think> tags (see the short example below).
Base model: unsloth/Qwen3-4B-Thinking-2507
Chat template: qwen3-thinking
Best for: Complex reasoning, math, debugging, multi-step problems
The Instruct variant provides direct answers without explicit reasoning traces.
Base model: unsloth/Qwen3-4B-Instruct-2507
Chat template: qwen3-instruct
Best for: Simple tasks, conversational use, when speed matters
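For reference, a Thinking-variant response wraps its reasoning in <think> tags before the final answer, roughly like this (illustrative example, not actual model output):

<think>
The user wants the sum of the first ten positive integers. Using n(n+1)/2 with n = 10 gives 55.
</think>
The sum of the first ten positive integers is 55.

The Instruct variant would return only the final sentence.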
Open in Google Colab
Full Training Script
Copy this into a Colab notebook or run it locally:
# Cell 1: Install dependencies
!pip install unsloth datasets transformers trl
# Cell 2: Configuration
import os
import multiprocessing as mp

os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["HF_DATASETS_DISABLE_MULTIPROCESSING"] = "1"
# ======= CONFIGURE THESE =======
hf_account = "your-username"  # Your HuggingFace username
hf_token = "hf_..."           # Your HuggingFace write token
output_model_name = "Qwen3-4B-Thinking-2507-My-Distill"

# Choose your base model:
# - "unsloth/Qwen3-4B-Thinking-2507" for thinking/CoT
# - "unsloth/Qwen3-4B-Instruct-2507" for instruct/no-CoT
input_model = "unsloth/Qwen3-4B-Thinking-2507"

# Choose your chat template:
# - "qwen3-thinking" for thinking models
# - "qwen3-instruct" for instruct models
chat_template = "qwen3-thinking"

# Choose a dataset (or use your own JSONL file)
dataset_id = "TeichAI/claude-4.5-opus-high-reasoning-250x"
# dataset_file = "your-dataset.jsonl"  # Uncomment to use local file

max_len = 8192
steps = 2000
# ================================
# Cell 3: Load model
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
import torch

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=input_model,
    max_seq_length=max_len,
    load_in_4bit=True,
    token=hf_token,
    attn_implementation="eager",
)
# Cell 4: Apply LoRA
model = FastLanguageModel.get_peft_model(
    model,
    r=32,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha=32,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=3407,
)
tokenizer = get_chat_template(tokenizer, chat_template=chat_template)
# Cell 5: Load dataset
from datasets import load_dataset
# Load from HuggingFace or local file
if 'dataset_id' in dir() and dataset_id:
    raw_dataset = load_dataset(dataset_id, split="train")
else:
    raw_dataset = load_dataset("json", data_files=dataset_file, split="train")

def formatting_prompts_func(examples):
    convos = examples["messages"]
    texts = [
        tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=False
        )
        for convo in convos
    ]
    return {"text": texts}

train_dataset = raw_dataset.map(formatting_prompts_func, batched=True)
print(f"Training on {len(train_dataset)} examples")
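# Note: if you use a local JSONL file (dataset_file in Cell 2), each line should
# carry a "messages" list of role/content turns, for example (illustrative):
#   {"messages": [{"role": "user", "content": "..."}, {"role": "assistant", "content": "..."}]}
# For the thinking variant, the assistant turn would normally also include the
# distilled reasoning trace.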
# Cell 6: Train
from trl import SFTTrainer, SFTConfig
if __name__ == "__main__":
    mp.freeze_support()

    trainer = SFTTrainer(
        model=model,
        processing_class=tokenizer,
        train_dataset=train_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            max_length=max_len,
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            max_steps=steps,
            learning_rate=2e-4,
            logging_steps=1,
            optim="adamw_8bit",
            weight_decay=0.01,
            lr_scheduler_type="linear",
            seed=3447,
            output_dir="outputs",
            save_strategy="steps",
            save_steps=200,
        ),
    )

    # GPU info
    gpu_stats = torch.cuda.get_device_properties(0)
    print(f"GPU = {gpu_stats.name}. Max memory = {gpu_stats.total_memory / 1e9:.1f} GB")

    # Set token IDs
    model.config.pad_token_id = tokenizer.pad_token_id
    model.config.eos_token_id = tokenizer.eos_token_id

    # Train!
    trainer.train()
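# (Optional) Quick sanity check before uploading -- a minimal sketch; the prompt
# and generation settings below are placeholders, adjust as needed.
FastLanguageModel.for_inference(model)  # switch Unsloth to inference mode
messages = [{"role": "user", "content": "Briefly explain why the sky is blue."}]
inputs = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)
print(tokenizer.decode(model.generate(inputs, max_new_tokens=256)[0]))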
# Cell 7: Upload to HuggingFace
model.push_to_hub_merged(
    f"{hf_account}/{output_model_name}",
    tokenizer,
    save_method="merged_16bit",
    token=hf_token,
)
print(f"✅ Uploaded merged model to {hf_account}/{output_model_name}")
# Cell 8: Create GGUF versions
model.push_to_hub_gguf(
    f"{hf_account}/{output_model_name}-GGUF",
    tokenizer,
    quantization_method=["bf16", "f16", "q8_0"],
    token=hf_token,
)
print(f"✅ Uploaded GGUF models to {hf_account}/{output_model_name}-GGUF")

Example Output Models
These models were created using this exact process: