---
# set random seed, so that you may reproduce your result.
# seeds are applied at yaml-load time via !apply: hooks
__set_seed1: !apply:random.seed [1988]
__set_seed2: !apply:numpy.random.seed [1988]
__set_seed3: !apply:torch.manual_seed [1988]
__set_seed4: !apply:torch.cuda.manual_seed_all [1988]

# fixed params
# audio sample rate (Hz), shared via !ref by the resampler, feat_extractor and vocoder below
sample_rate: 24000
text_encoder_input_size: 512
# hidden sizes fed to / produced by the LLM wrapper below
llm_input_size: 1536
llm_output_size: 1536

# local paths to the pretrained backbone checkpoint and the music tokenizer
basemodel_path: '../../pretrained_models/InspireMusic-1.5B-Long/'
generator_path: '../../pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'

# model params
# for all class/function included in this repo, we use !<name> or !<new> for initialization, so that users may find every corresponding class/function from this single yaml.
# for system/third_party class/function, we do not require this.
llm: !new:inspiremusic.llm.llm.LLM
    text_encoder_input_size: !ref <text_encoder_input_size>
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    audio_token_size: 4096
    # normalize the loss by sequence length (per the flag name) — confirm in LLM impl
    length_normalized_loss: true
    # label-smoothing weight; 0 disables smoothing
    lsm_weight: 0
    text_encoder_conf:
        name: "none"
    # backbone: Qwen embedding encoder loaded from the pretrained checkpoint
    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
        input_size: !ref <text_encoder_input_size>
        pretrain_path: !ref <basemodel_path>

    # decoding strategy: top-k sampling with k=350
    sampling: !name:inspiremusic.utils.common.topk_sampling
        top_k: 350
    # classifier-free guidance: condition dropped 20% of the time in training,
    # guidance scale 3.0 at inference
    train_cfg_ratio: 0.2
    infer_cfg_ratio: 3.0
# flow-matching acoustic model (audio tokens -> mel), per the module names below
flow: !new:inspiremusic.flow.flow.MaskedDiff
    input_size: 256
    output_size: 80
    output_type: 'mel'
    vocab_size: 4096
    # audio token rate in frames per second — TODO confirm against tokenizer
    input_frame_rate: 75
    only_mask_loss: true
    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
        output_size: 512
        attention_heads: 4
        linear_units: 1024
        num_blocks: 3
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: true
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 256
        use_cnn_module: false
        macaron_style: false
    # interpolates encoder output to the target mel length
    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
        channels: 512
        sampling_ratios: [1, 1, 1, 1]
    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
        in_channels: 240
        cfm_params: !new:omegaconf.DictConfig
            content:
                # written as 1.0e-06 (with a dot) so YAML 1.1 resolvers also read
                # a float; bare "1e-06" resolves as a *string* under 1.1 rules
                sigma_min: 1.0e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
            in_channels: 1024
            out_channels: 512
            channels: [256, 256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 8
            num_heads: 8
            act_fn: 'gelu'
    generator_model_dir: !ref <generator_path>

# HiFi-GAN-based vocoder (module path: inspiremusic.hifigan) rendering 80-bin mels to waveform
hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    # nsf_* keys configure the source-excitation module — presumably a neural
    # source filter; verify against HiFTGenerator
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    # inverse-STFT head parameters
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    # presumably clamps output samples to +/-0.99 to avoid clipping — confirm
    audio_limit: 0.99
    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512

wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator

# processor functions
# each !name: entry is a partially-applied callable used by the data pipeline
parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
    tokenizer_path: !ref <basemodel_path>
    tokenizer_name: "qwen-2.5"
allowed_special: 'all'
tokenize: !name:inspiremusic.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
# drop samples outside these length bounds (units per processor.filter — confirm)
filter: !name:inspiremusic.dataset.processor.filter
    max_length: 28000
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:inspiremusic.dataset.processor.resample
    resample_rate: !ref <sample_rate>
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 128
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    # NOTE(review): fmax equals the sample rate (24000), which is above the
    # Nyquist frequency (12000) — confirm this is intentional for this extractor
    fmax: 24000
    center: false
compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
    normalize: true
shuffle: !name:inspiremusic.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:inspiremusic.dataset.processor.sort
    sort_size: 500  # sort_size should be less than shuffle_size
batch: !name:inspiremusic.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 10000  # llm 12000
padding: !name:inspiremusic.dataset.processor.padding

# dataset processor pipeline (applied in order)
# NOTE(review): resample / feat_extractor / compute_fbank / parse_embedding are
# defined above but not referenced here — presumably kept for other recipes; confirm
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <filter>,
    !ref <batch>,
    !ref <padding>,
]


# train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
    scheduler: warmuplr
    scheduler_conf:
        warmup_steps: 5000
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2
    log_interval: 100
    save_per_step: 500