File size: 3,198 Bytes
341d98d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
{
    "backbone": {
        "d_model": 2048,
        "d_intermediate": 0,
        "attn_mlp_d_intermediate": 8192,
        "n_layer": 46,
        "ssm_cfg": {
            "layer": "Mamba2"
        },
        "attn_layer_idx": [
            0,
            4,
            8,
            12,
            16,
            20,
            24,
            28,
            32,
            36,
            40,
            44
        ],
        "attn_cfg": {
            "causal": true,
            "num_heads": 16,
            "num_heads_kv": 4,
            "rotary_emb_dim": 128,
            "qkv_proj_bias": false,
            "out_proj_bias": false
        },
        "rms_norm": false,
        "residual_in_fp32": false,
        "norm_epsilon": 1e-05
    },
    "prefix_conditioner": {
        "conditioners": [
            {
                "type": "EspeakPhonemeConditioner",
                "name": "espeak"
            },
            {
                "cond_dim": 128,
                "uncond_type": "learned",
                "projection": "linear",
                "type": "PassthroughConditioner",
                "name": "speaker"
            },
            {
                "input_dim": 8,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "emotion"
            },
            {
                "min_val": 0,
                "max_val": 24000,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "fmax"
            },
            {
                "min_val": 0,
                "max_val": 400,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "pitch_std"
            },
            {
                "min_val": 0,
                "max_val": 40,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "speaking_rate"
            },
            {
                "min_val": -1,
                "max_val": 126,
                "uncond_type": "learned",
                "type": "IntegerConditioner",
                "name": "language_id"
            },
            {
                "input_dim": 8,
                "min_val": 0.5,
                "max_val": 0.8,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "vqscore_8"
            },
            {
                "min_val": -1.0,
                "max_val": 1000,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "ctc_loss"
            },
            {
                "min_val": 1,
                "max_val": 5,
                "uncond_type": "learned",
                "type": "FourierConditioner",
                "name": "dnsmos_ovrl"
            },
            {
                "min_val": 0,
                "max_val": 1,
                "uncond_type": "learned",
                "type": "IntegerConditioner",
                "name": "speaker_noised"
            }
        ],
        "projection": "linear"
    },
    "eos_token_id": 1024,
    "masked_token_id": 1025
}