# =============================================================================
# SuperMerge-14B-Simple
#
# This configuration merges only two components:
# - Base Model: Provides stable foundational features.
#     Model: sometimesanotion/Lamarck-14B-v0.7-rc4
#
# - Reasoning Module: Drives enhanced mid-layer reasoning.
#     Model: sthenno/tempesthenno-ppo-ckpt40
#
# The merge is performed using slerp with an interpolation curve that peaks at
# the mid layers (an inverted V). The weighting of each 8-layer slice is tuned
# to balance core feature preservation with advanced reasoning.
# =============================================================================
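#
# Usage (a minimal sketch, not part of the merge spec): assuming mergekit is
# installed and this file is saved as supermerge-14b-simple.yaml, the merge
# can typically be run with the mergekit-yaml entry point. The output
# directory below is illustrative; add --cuda to merge on GPU if your
# mergekit version supports it.
#
#   mergekit-yaml supermerge-14b-simple.yaml ./SuperMerge-14B-Simple
#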
name: SuperMerge-14B-Simple
merge_method: slerp
base_model: sometimesanotion/Lamarck-14B-v0.7-rc4
tokenizer_source: base
dtype: float32
out_dtype: bfloat16
parameters:
  int8_mask: true   # Store internal masks as int8 to reduce memory usage.
  normalize: true   # Normalize contribution weights so they sum to 1.
  rescale: false    # No additional rescaling necessary.
  # Interpolation curve across the 6 slices (48 layers total):
  # peaks at the mid layers (an inverted V) to emphasize mid-layer processing.
  t: [0.1, 0.35, 0.85, 0.85, 0.35, 0.1]
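  # For reference, a sketch of the interpolation slerp applies (standard slerp
  # formula; assumed to match mergekit's behavior): t = 0 keeps the base model,
  # t = 1 keeps the reasoning module, and intermediate values interpolate
  # along the unit sphere between weight tensors a and b:
  #   slerp(a, b, t) = sin((1 - t) * theta) / sin(theta) * a
  #                  + sin(t * theta) / sin(theta) * b
  # where theta is the angle between the (flattened) tensors a and b.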
slices:
  # ---------------------------------------------------------------------------
  # Slice 1 (Layers 0-8):
  # - Early layers: nearly pure base model with minimal PPO influence.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [0, 8]
        parameters:
          weight: 0.95
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [0, 8]
        parameters:
          weight: 0.05
  # ---------------------------------------------------------------------------
  # Slice 2 (Layers 8-16):
  # - Blend base with stronger PPO contributions to boost reasoning.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [8, 16]
        parameters:
          weight: 0.4
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [8, 16]
        parameters:
          weight: 0.6
  # ---------------------------------------------------------------------------
  # Slice 3 (Layers 16-24):
  # - Mid-layer: Prioritize advanced reasoning by increasing the PPO share.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [16, 24]
        parameters:
          weight: 0.3
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [16, 24]
        parameters:
          weight: 0.7
  # ---------------------------------------------------------------------------
  # Slice 4 (Layers 24-32):
  # - Continue the focus on PPO-driven reasoning while still retaining base traits.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [24, 32]
        parameters:
          weight: 0.35
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [24, 32]
        parameters:
          weight: 0.65
  # ---------------------------------------------------------------------------
  # Slice 5 (Layers 32-40):
  # - Re-stabilize the network with a stronger base model contribution.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [32, 40]
        parameters:
          weight: 0.6
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [32, 40]
        parameters:
          weight: 0.4
  # ---------------------------------------------------------------------------
  # Slice 6 (Layers 40-48):
  # - Final output layers: Maintain fluency with the base model augmented by PPO.
  # ---------------------------------------------------------------------------
  - sources:
      - model: sometimesanotion/Lamarck-14B-v0.7-rc4
        layer_range: [40, 48]
        parameters:
          weight: 0.6
      - model: sthenno/tempesthenno-ppo-ckpt40
        layer_range: [40, 48]
        parameters:
          weight: 0.4
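
# -----------------------------------------------------------------------------
# Post-merge sanity check (a minimal sketch, not part of the merge spec).
# Assumes the merged model was written to ./SuperMerge-14B-Simple and that
# transformers (plus accelerate, for device_map="auto") is installed; the
# prompt is arbitrary.
#
#   import torch
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   tok = AutoTokenizer.from_pretrained("./SuperMerge-14B-Simple")
#   model = AutoModelForCausalLM.from_pretrained(
#       "./SuperMerge-14B-Simple", torch_dtype=torch.bfloat16, device_map="auto")
#   inputs = tok("Briefly explain spherical interpolation.", return_tensors="pt")
#   inputs = inputs.to(model.device)
#   out = model.generate(**inputs, max_new_tokens=32)
#   print(tok.decode(out[0], skip_special_tokens=True))
# -----------------------------------------------------------------------------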