---
# set random seed, so that you may reproduce your result.
__set_seed1: !apply:random.seed [1988]
__set_seed2: !apply:numpy.random.seed [1988]
__set_seed3: !apply:torch.manual_seed [1988]
__set_seed4: !apply:torch.cuda.manual_seed_all [1988]
# fixed params
sample_rate: 24000  # audio sample rate in Hz (also used by resample/hift below)
text_encoder_input_size: 512
llm_input_size: 1536
llm_output_size: 1536
# local paths to the pretrained base model and its music tokenizer
basemodel_path: '../../pretrained_models/InspireMusic-1.5B-Long/'
generator_path: '../../pretrained_models/InspireMusic-1.5B-Long/music_tokenizer'
# model params
# for every class/function defined in this repo, we use !new: or !name: for initialization, so that users can locate each corresponding class/function from this single yaml.
# for system/third_party class/function, we do not require this.
# token-prediction LLM built on the pretrained Qwen base model
llm: !new:inspiremusic.llm.llm.LLM
    text_encoder_input_size: !ref <text_encoder_input_size>
    llm_input_size: !ref <llm_input_size>
    llm_output_size: !ref <llm_output_size>
    audio_token_size: 4096
    length_normalized_loss: true  # canonical lowercase boolean (was: True)
    lsm_weight: 0  # presumably label-smoothing weight — confirm in LLM
    text_encoder_conf:
        name: "none"
    # Qwen-based embedding encoder loaded from the pretrained base model
    llm: !new:inspiremusic.transformer.qwen_encoder.QwenEmbeddingEncoder
        input_size: !ref <text_encoder_input_size>
        pretrain_path: !ref <basemodel_path>
    # top-k sampling used when generating tokens
    sampling: !name:inspiremusic.utils.common.topk_sampling
        top_k: 350
    # NOTE(review): "cfg" here presumably refers to classifier-free guidance
    # (dropout ratio at train time, guidance scale at inference) — TODO confirm
    train_cfg_ratio: 0.2
    infer_cfg_ratio: 3.0
# conditional flow-matching model producing mel features (output_type: 'mel')
flow: !new:inspiremusic.flow.flow.MaskedDiff
    input_size: 256
    output_size: 80
    output_type: 'mel'
    vocab_size: 4096
    input_frame_rate: 75
    only_mask_loss: true  # canonical lowercase boolean (was: True)
    encoder: !new:inspiremusic.transformer.encoder.ConformerEncoder
        output_size: 512
        attention_heads: 4
        linear_units: 1024
        num_blocks: 3
        dropout_rate: 0.1
        positional_dropout_rate: 0.1
        attention_dropout_rate: 0.1
        normalize_before: true
        input_layer: 'linear'
        pos_enc_layer_type: 'rel_pos_espnet'
        selfattention_layer_type: 'rel_selfattn'
        input_size: 256
        use_cnn_module: false
        macaron_style: false
    length_regulator: !new:inspiremusic.flow.length_regulator.InterpolateRegulator
        channels: 512
        sampling_ratios: [1, 1, 1, 1]
    decoder: !new:inspiremusic.flow.flow_matching.ConditionalCFM
        in_channels: 240
        cfm_params: !new:omegaconf.DictConfig
            content:
                # written as 1.0e-06 (with a dot) so YAML 1.1-style resolvers
                # load a float; the bare form 1e-06 is resolved as a *string*
                # by PyYAML-family loaders
                sigma_min: 1.0e-06
                solver: 'euler'
                t_scheduler: 'cosine'
                training_cfg_rate: 0.2
                inference_cfg_rate: 0.7
                reg_loss_type: 'l1'
        estimator: !new:inspiremusic.flow.decoder.ConditionalDecoder
            in_channels: 1024
            out_channels: 512
            channels: [256, 256]
            dropout: 0.0
            attention_head_dim: 64
            n_blocks: 4
            num_mid_blocks: 8
            num_heads: 8
            act_fn: 'gelu'
    generator_model_dir: !ref <generator_path>
# HiFT vocoder (inspiremusic.hifigan.generator) that renders waveforms
hift: !new:inspiremusic.hifigan.generator.HiFTGenerator
    in_channels: 80
    base_channels: 512
    nb_harmonics: 8
    sampling_rate: !ref <sample_rate>
    nsf_alpha: 0.1
    nsf_sigma: 0.003
    nsf_voiced_threshold: 10
    upsample_rates: [8, 8]
    upsample_kernel_sizes: [16, 16]
    istft_params:
        n_fft: 16
        hop_len: 4
    resblock_kernel_sizes: [3, 7, 11]
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    source_resblock_kernel_sizes: [7, 11]
    source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5]]
    lrelu_slope: 0.1
    audio_limit: 0.99
    f0_predictor: !new:inspiremusic.hifigan.f0_predictor.ConvRNNF0Predictor
        num_class: 1
        in_channels: 80
        cond_channels: 512
# NOTE(review): instantiated with all-default arguments — confirm this is
# intended and not a placeholder for a dedicated wavtokenizer configuration
wavtokenizer: !new:inspiremusic.hifigan.generator.HiFTGenerator
# processor functions (partially applied via !name:, composed in data_pipeline)
parquet_opener: !name:inspiremusic.dataset.processor.parquet_opener
get_tokenizer: !name:inspiremusic.text.tokenizer.get_tokenizer
    tokenizer_path: !ref <basemodel_path>
    tokenizer_name: "qwen-2.5"
allowed_special: 'all'
tokenize: !name:inspiremusic.dataset.processor.tokenize
    get_tokenizer: !ref <get_tokenizer>
    allowed_special: !ref <allowed_special>
# drop samples outside these length bounds
filter: !name:inspiremusic.dataset.processor.filter
    max_length: 28000
    min_length: 0
    token_max_length: 200
    token_min_length: 1
resample: !name:inspiremusic.dataset.processor.resample
    resample_rate: !ref <sample_rate>
# mel-spectrogram feature extractor
feat_extractor: !name:matcha.utils.audio.mel_spectrogram
    n_fft: 1024
    num_mels: 128
    sampling_rate: !ref <sample_rate>
    hop_size: 256
    win_size: 1024
    fmin: 0
    # NOTE(review): fmax equals the 24 kHz sample rate, i.e. above the 12 kHz
    # Nyquist frequency — confirm this is intentional for this extractor
    fmax: 24000
    center: false  # canonical lowercase boolean (was: False)
compute_fbank: !name:inspiremusic.dataset.processor.compute_fbank
    feat_extractor: !ref <feat_extractor>
parse_embedding: !name:inspiremusic.dataset.processor.parse_embedding
    normalize: true  # canonical lowercase boolean (was: True)
shuffle: !name:inspiremusic.dataset.processor.shuffle
    shuffle_size: 1000
sort: !name:inspiremusic.dataset.processor.sort
    sort_size: 500  # sort_size should be less than shuffle_size
batch: !name:inspiremusic.dataset.processor.batch
    batch_type: 'dynamic'
    max_frames_in_batch: 10000  # llm 12000
padding: !name:inspiremusic.dataset.processor.padding
# dataset processor pipeline: the stages below are applied in order.
# NOTE(review): resample / compute_fbank / parse_embedding are defined above
# but not included here — confirm that is intended for this recipe.
data_pipeline: [
    !ref <parquet_opener>,
    !ref <tokenize>,
    !ref <shuffle>,
    !ref <sort>,
    !ref <filter>,
    !ref <batch>,
    !ref <padding>,
]
# train conf
train_conf:
    optim: adam
    optim_conf:
        lr: 0.0001 # change to 0.001 if you want to train flow from scratch
    scheduler: warmuplr
    scheduler_conf:
        warmup_steps: 5000
    max_epoch: 200
    grad_clip: 5
    accum_grad: 2  # gradient accumulation steps per optimizer update
    log_interval: 100  # steps between log messages
    save_per_step: 500  # save a checkpoint every 500 steps