{
  "bias": false,
  "capacity_factor": 0.12,
  "d_model": 1024,
  "dropout": 0.2,
  "ffn": "swiglu",
  "hidden_dim": 4096,
  "mixture_of_depth": true,
  "mixture_of_expert": false,
  "model_type": {
    "mixture_of_depth": true,
    "name": "mixture of depth"
  },
  "moe_num_experts": 4,
  "moe_num_experts_per_tok": 2,
  "multiple_of": 4,
  "num_heads": 16,
  "num_kv_heads": 0,
  "num_layers": 16,
  "seq_len": 512,
  "vocab_size": 50257,
  "weight_tying": true,
  "window_size": 128
}