|
---
|
|
base_model: []
|
|
library_name: transformers
|
|
tags:
|
|
- mergekit
|
|
- merge
|
|
|
|
---
|
|
# Qwen-R1-Dolphin-14B-exp40-3
|
|
|
|
This is a merge of pre-trained language models created using [mergekit](https://github.com/cg123/mergekit).
|
|
|
|
## Merge Details
|
|
### Merge Method
|
|
|
|
This model was merged using the passthrough merge method.
|
|
|
|
### Models Merged
|
|
|
|
The following models were included in the merge:
|
|
* D:/DeepSeek-R1-Distill-Qwen-14B
|
|
|
|
### Configuration
|
|
|
|
The following YAML configuration was used to produce this model:
|
|
|
|
```yaml
|
|
# Six splits plus "end game"
|
|
# "D" (the default scale value) starts at +.1 vs. the down_proj/o_proj scale.
|
|
# 40 plus.
|
|
|
|
slices:
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [0, 47]
|
|
|
|
# conc layers
|
|
# split 1
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.01
|
|
- filter: down_proj
|
|
value: 0.01
|
|
- value: 0.11
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.02
|
|
- filter: down_proj
|
|
value: 0.02
|
|
- value: 0.12
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.03
|
|
- filter: down_proj
|
|
value: 0.03
|
|
- value: 0.13
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.04
|
|
- filter: down_proj
|
|
value: 0.04
|
|
- value: 0.61
|
|
|
|
# split 2, SURGE D THEN D drop .46, continues @ D .15 (from .13)
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.05
|
|
- filter: down_proj
|
|
value: 0.05
|
|
- value: 0.15
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.06
|
|
- filter: down_proj
|
|
value: 0.06
|
|
- value: 0.16
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.07
|
|
- filter: down_proj
|
|
value: 0.07
|
|
- value: 0.17
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.08
|
|
- filter: down_proj
|
|
value: 0.08
|
|
- value: 0.41
|
|
|
|
# split 3, SURGE D to .41, D drop .22 ... follows .17 previous
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.09
|
|
- filter: down_proj
|
|
value: 0.09
|
|
- value: 0.19
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.10
|
|
- filter: down_proj
|
|
value: 0.10
|
|
- value: 0.20
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.11
|
|
- filter: down_proj
|
|
value: 0.11
|
|
- value: .22
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.12
|
|
- filter: down_proj
|
|
value: 0.12
|
|
- value: .24
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.13
|
|
- filter: down_proj
|
|
value: 0.13
|
|
- value: .26
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.14
|
|
- filter: down_proj
|
|
value: 0.14
|
|
- value: .28
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.15
|
|
- filter: down_proj
|
|
value: 0.15
|
|
- value: .30
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.16
|
|
- filter: down_proj
|
|
value: 0.16
|
|
- value: .31
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.20
|
|
- filter: down_proj
|
|
value: 0.20
|
|
- value: .32
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.21
|
|
- filter: down_proj
|
|
value: 0.21
|
|
- value: .33
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.22
|
|
- filter: down_proj
|
|
value: 0.22
|
|
- value: .34
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.23
|
|
- filter: down_proj
|
|
value: 0.23
|
|
- value: .35
|
|
|
|
# split 4, NO SURGE D, "D" drops by .24; reverts to .11 (the very first "D" setting)
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.24
|
|
- filter: down_proj
|
|
value: 0.24
|
|
- value: 0.11
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.241
|
|
- filter: down_proj
|
|
value: 0.241
|
|
- value: 0.12
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.242
|
|
- filter: down_proj
|
|
value: 0.243
|
|
- value: 0.13
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.244
|
|
- filter: down_proj
|
|
value: 0.244
|
|
- value: 0.61
|
|
|
|
# split 5, D Surge to .61, drop to .15 (following .13)
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.245
|
|
- filter: down_proj
|
|
value: 0.245
|
|
- value: 0.15
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.246
|
|
- filter: down_proj
|
|
value: 0.246
|
|
- value: 0.16
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.247
|
|
- filter: down_proj
|
|
value: 0.247
|
|
- value: 0.17
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.248
|
|
- filter: down_proj
|
|
value: 0.248
|
|
- value: 0.41
|
|
|
|
# split 6, D surge to .41 , then follows .17
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.249
|
|
- filter: down_proj
|
|
value: 0.249
|
|
- value: 0.19
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.250
|
|
- filter: down_proj
|
|
value: 0.250
|
|
- value: 0.20
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.251
|
|
- filter: down_proj
|
|
value: 0.251
|
|
- value: .22
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.252
|
|
- filter: down_proj
|
|
value: 0.252
|
|
- value: .24
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.253
|
|
- filter: down_proj
|
|
value: 0.254
|
|
- value: .26
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.255
|
|
- filter: down_proj
|
|
value: 0.255
|
|
- value: .28
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.256
|
|
- filter: down_proj
|
|
value: 0.256
|
|
- value: .30
|
|
|
|
# O PROJ, DPROJ to .3333 /
|
|
# end game
|
|
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.3333333333333
|
|
- filter: down_proj
|
|
value: 0.3333333333333
|
|
- value: 0.3333333333333
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.4444444444444
|
|
- filter: down_proj
|
|
value: 0.4444444444444
|
|
- value: 0.4444444444444
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.5555555555555
|
|
- filter: down_proj
|
|
value: 0.5555555555555
|
|
- value: 0.5555555555555
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.6666666666666
|
|
- filter: down_proj
|
|
value: 0.6666666666666
|
|
- value: 0.6666666666666
|
|
- sources:
|
|
- model: D:/DeepSeek-R1-Distill-Qwen-14B
|
|
layer_range: [47,48]
|
|
parameters:
|
|
scale:
|
|
- filter: o_proj
|
|
value: 0.777777777777
|
|
- filter: down_proj
|
|
value: 0.777777777777
|
|
- value: 0.888888888888
|
|
merge_method: passthrough
|
|
dtype: bfloat16
|
|
```
|
|
|