# Generated 2022-02-15 from:
# /content/speechbrain/recipes/fluent-speech-commands/direct/hparams/train.yaml
# yamllint disable
# ############################################################################
# Model: Direct SLU
# Encoder: Pre-trained ASR encoder -> LSTM
# Decoder: GRU + beamsearch
# Tokens: BPE with unigram
# losses: NLL
# Training: Fluent Speech Commands
# Authors: Loren Lugosch, Mirco Ravanelli 2020
# ############################################################################

# Seed needs to be set at top of yaml, before objects with parameters are made
seed: 112011
__set_seed: !apply:torch.manual_seed [112011]
output_folder: results/BPE51/112011
save_folder: results/BPE51/112011/save
train_log: results/BPE51/112011/train_log.txt

# Data files
data_folder: /content/fluent_speech_commands_dataset  # e.g, /localscratch/fluent_speech_commands_dataset
rir_folder: /content/fluent_speech_commands_dataset  # Change it if needed
csv_train: results/BPE51/112011/train.csv
csv_valid: results/BPE51/112011/valid.csv
csv_test: results/BPE51/112011/test.csv
tokenizer_file: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1
skip_prep: false

# Training parameters
number_of_epochs: 4
batch_size: 8
lr: 0.0003
token_type: unigram  # ["unigram", "bpe", "char"]
sorting: random

# Model parameters
sample_rate: 16000
emb_size: 128
dec_neurons: 512
output_neurons: 51  # index(eos/bos) = 0
ASR_encoder_dim: 512
encoder_dim: 256

# Decoding parameters
bos_index: 0
eos_index: 0
min_decode_ratio: 0.0
max_decode_ratio: 10.0
slu_beam_size: 80
eos_threshold: 1.5
temperature: 1.25

dataloader_opts:
  batch_size: 8
  shuffle: true

epoch_counter: &id013 !new:speechbrain.utils.epoch_loop.EpochCounter
  limit: 4

# Models
asr_model: !apply:speechbrain.pretrained.EncoderDecoderASR.from_hparams
  source: speechbrain/asr-crdnn-rnnlm-librispeech
  run_opts: {device: "cuda:0"}

slu_enc: &id006 !new:speechbrain.nnet.containers.Sequential
  input_shape: [null, null, 512]
  lstm: !new:speechbrain.nnet.RNN.LSTM
    input_size: 512
    bidirectional: true
    hidden_size: 256
    num_layers: 2
  linear: !new:speechbrain.nnet.linear.Linear
    input_size: 512
    n_neurons: 256

output_emb: &id007 !new:speechbrain.nnet.embedding.Embedding
  num_embeddings: 51
  embedding_dim: 128

dec: &id008 !new:speechbrain.nnet.RNN.AttentionalRNNDecoder
  enc_dim: 256
  input_size: 128
  rnn_type: gru
  attn_type: keyvalue
  hidden_size: 512
  attn_dim: 512
  num_layers: 3
  scaling: 1.0
  dropout: 0.0

seq_lin: &id009 !new:speechbrain.nnet.linear.Linear
  input_size: 512
  n_neurons: 51

augment_wavedrop: &id001 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [100]

augment_speed: &id002 !new:speechbrain.lobes.augment.TimeDomainSpecAugment
  sample_rate: 16000
  speeds: [95, 100, 105]

add_rev: &id003 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /content/fluent_speech_commands_dataset
  openrir_max_noise_len: 3.0  # seconds
  reverb_prob: 1.0
  noise_prob: 0.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_noise: &id004 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /content/fluent_speech_commands_dataset
  openrir_max_noise_len: 3.0  # seconds
  reverb_prob: 0.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

add_rev_noise: &id005 !new:speechbrain.lobes.augment.EnvCorrupt
  openrir_folder: /content/fluent_speech_commands_dataset
  openrir_max_noise_len: 3.0  # seconds
  reverb_prob: 1.0
  noise_prob: 1.0
  noise_snr_low: 0
  noise_snr_high: 15
  rir_scale_factor: 1.0

augment_pipeline: [*id001, *id002, *id003, *id004, *id005]

modules:
  augment_wavedrop: *id001
  augment_speed: *id002
  add_rev: *id003
  add_noise: *id004
  add_rev_noise: *id005
  slu_enc: *id006
  output_emb: *id007
  dec: *id008
  seq_lin: *id009

model: &id011 !new:torch.nn.ModuleList
  - [*id006, *id007, *id008, *id009]

tokenizer: &id010 !new:sentencepiece.SentencePieceProcessor

pretrainer: !new:speechbrain.utils.parameter_transfer.Pretrainer
  collect_in: results/BPE51/112011/save/FSC_tokenizer
  loadables:
    tokenizer: *id010
  paths:
    tokenizer: https://www.dropbox.com/s/hvf2huofnq0sjbn/51_unigram.model?dl=1

beam_searcher: !new:speechbrain.decoders.S2SRNNBeamSearcher
  embedding: *id007
  decoder: *id008
  linear: *id009
  bos_index: 0
  eos_index: 0
  min_decode_ratio: 0.0
  max_decode_ratio: 10.0
  beam_size: 80
  eos_threshold: 1.5
  temperature: 1.25
  using_max_attn_shift: false
  max_attn_shift: 30
  coverage_penalty: 0.0

opt_class: !name:torch.optim.Adam
  lr: 0.0003

lr_annealing: &id012 !new:speechbrain.nnet.schedulers.NewBobScheduler
  initial_value: 0.0003
  improvement_threshold: 0.0025
  annealing_factor: 0.8
  patient: 0

checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
  checkpoints_dir: results/BPE51/112011/save
  recoverables:
    model: *id011
    scheduler: *id012
    counter: *id013

log_softmax: !new:speechbrain.nnet.activations.Softmax
  apply_log: true

seq_cost: !name:speechbrain.nnet.losses.nll_loss
  label_smoothing: 0.1

train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
  save_file: results/BPE51/112011/train_log.txt

error_rate_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats

cer_computer: !name:speechbrain.utils.metric_stats.ErrorRateStats
  split_tokens: true