# Training configuration layered on top of the `common` defaults entry.
# NOTE(review): the `defaults` list and `${...}` interpolations suggest this
# file is consumed by Hydra/OmegaConf - confirm against the loader.
defaults:
  - common

train:
  batch_size: 128
  betas: [0.8, 0.99]
  c_kl: 1.0
  c_mel: 45
  distributed: false  # BUG: multi-gpu is not working
  use_multiprocessing: false  # BUG: multi-gpu is not working
  epochs: 20
  # Exponent floats are written with an explicit decimal point so that
  # YAML 1.1 loaders (e.g. PyYAML) also resolve them as floats, not strings.
  eps: 1.0e-9
  fp16_run: false
  init_lr_ratio: 1
  raise_error: false
  learning_rate: 2.0e-4
  log_interval: 10
  log_level: ${log_level}
  lr_decay: 0.98
  max_speclen: 128
  port: 8005
  resume_training: false  # set to false to finetune from a model
  seed: 1234
  segment_size: 8960
  use_sr: false
  valid_epoch_interval: 1
  valid_steps_interval: 1000
  save_epoch_interval: 10
  save_steps_interval: 1000
  warmup_epochs: 0
  # weighted_batch_speaker_sampling: false
  # weighted_batch_lang_sampling: false
  weighted_batch_speaker_sampling: 0.5
  weighted_batch_lang_sampling: 0.5

data:
  dataset_dir: /raid/lucasgris/free-svc/data
  filter_length: 1280
  hop_length: 320
  max_wav_value: 32768.0
  mel_fmax: null
  mel_fmin: 0.0
  n_mel_channels: 80
  num_workers: 64
  # For pitch extraction, set the pitch_predictor (will compute in dataloader)
  # or pitch_features_dir (will load from disk).
  pitch_predictor: rmvpe  # pm | crepe | harvest | dio | rmvpe | fcpe
  pitch_features_dir: ${data.dataset_dir}/pitch_features/
  sampling_rate: 24000
  # Example value: ${data.dataset_dir}/spectrograms
  # It is recommended NOT to use this if you have little disk space.
  spectrogram_dir: null
  # For speaker embedding extraction, set use_spk_emb to true and
  # spk_embeddings_dir (will load from disk), or configure the model to
  # compute it on forward.
  use_spk_emb: true
  spk_embeddings_dir: ${data.dataset_dir}/spk_embeddings
  # SR augmentation is deprecated; set use_sr to false.
  sr_min_max: [68, 92]
  # For content feature extraction, set content_feature_dir (will load from
  # disk), or configure the model to compute it on forward.
  content_feature_dir: null
  training_files: data/train.csv
  validation_files: data/valid.csv
  win_length: 1280

model:
  save_dir: null
  filter_channels: 768
  finetune_from_model:
    discriminator: /raid/lucasgris/free-svc/D-freevc-24.pth
    generator: /raid/lucasgris/free-svc/freevc-24.pth
  hidden_channels: 192
  inter_channels: 192
  kernel_size: 3
  n_heads: 2
  n_layers_q: 3
  n_layers: 6
  p_dropout: 0.1
  resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
  resblock_kernel_sizes: [3, 7, 11]
  resblock: 1
  c_dim: 768
  upsample_initial_channel: 512
  upsample_kernel_sizes: [16, 16, 4, 4]
  upsample_rates: [10, 8, 2, 2]
  use_spectral_norm: false
  freeze_external_spk: true
  device: cuda
  # For online speaker embedding extraction, set use_spk_emb to true and
  # spk_encoder_type.
  use_spk_emb: false
  gin_channels: null  # gin_channels = spk_encoder.embedding_dim
  spk_encoder_type: null  # ECAPA2SpeakerEncoder16k |
  # For content feature extraction, set the content_encoder_type and
  # content_encoder_ckpt.
  content_encoder_type: null  # load from disk (data) - hubert | wavlm
  # load from disk (data) - [path] | models/wavlm/WavLM-Large.pt |
  # lengyue233/content-vec-best
  content_encoder_ckpt: null
  post_content_encoder_type: vits-encoder-with-uv-emb  # or freevc-bottleneck
  coarse_f0: true
  cond_f0_on_flow: false