NUM_ENCODEC_TARGETS=8
NUM_TOTAL_TARGETS=8
NUM_TARGET_TOKENS=1024
MASK_AMOUNT=150
MASK_GAP_SIZE=15
MASK_PROP=0.5
MODEL_DIM=768
NUM_ENCODER_LAYERS=10
NUM_ENCODER_HEADS=12
NUM_DECODER_LAYERS=2
NUM_DECODER_HEADS=12
MASKED_LOSS_WEIGHT=0.9
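
# Entry point: get_model instantiates the model class selected below.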
get_model.model=@models.EncodecMAE
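
# EncodecMAE wiring: waveform encoder, discrete targets, time-gap masking,
# a transformer encoder over visible frames, sinusoidal positions, a shallow
# decoder, and a frame-level classification head. Each component is
# configured in its own block below.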
models.EncodecMAE:
    wav_encoder=@models.encodecmae.encoders.EncodecEncoder
    target_encoder=@models.encodecmae.targets.EncodecQuantizer
    masker=@models.encodecmae.masking.TimeGapMask
    visible_encoder=@encoder/models.transformers.TransformerEncoder
    positional_encoder=@models.transformers.SinusoidalPositionalEmbeddings
    decoder=@decoder/models.transformers.TransformerEncoder
    head=@models.encodecmae.heads.FrameLevelClassificationHead
    optimizer=@torch.optim.AdamW
    lr_scheduler=None
    masked_weight=%MASKED_LOSS_WEIGHT
    quantizer_weights=[0.22407463, 0.1759858, 0.14499009, 0.12150037, 0.10315603, 0.08831368, 0.07608274, 0.06589669, 1.0]
    n_extra_targets=1
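
# Optimizer: AdamW with no LR scheduler. PRETRAIN_MAX_LR is not defined in
# this file and must be bound elsewhere in the config tree.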
torch.optim.AdamW:
    lr=%PRETRAIN_MAX_LR
    betas=(0.9,0.95)
    weight_decay=0.05
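
# Targets: discrete tokens taken from the first NUM_ENCODEC_TARGETS EnCodec
# residual quantizer streams.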
models.encodecmae.targets.EncodecQuantizer:
    n=%NUM_ENCODEC_TARGETS
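
# Masking: contiguous time gaps dropped from the frame sequence
# (see TimeGapMask for the exact semantics of the three parameters).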
models.encodecmae.masking.TimeGapMask:
    mask_amount=%MASK_AMOUNT
    gap_size=%MASK_GAP_SIZE
    mask_prop=%MASK_PROP
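
# Visible encoder: NUM_ENCODER_LAYERS transformer layers with
# NUM_ENCODER_HEADS-head attention, applied to the unmasked frames.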
encoder/models.transformers.TransformerEncoder:
    model_dim=%MODEL_DIM
    num_layers=%NUM_ENCODER_LAYERS
    attention_layer=@encoder/models.transformers.MultiHeadAttention
    compile=True

encoder/models.transformers.MultiHeadAttention:
    model_dim=%MODEL_DIM
    num_heads=%NUM_ENCODER_HEADS
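
# Decoder: a shallow NUM_DECODER_LAYERS-layer transformer, keeping most of
# the capacity in the encoder, as is typical for MAE-style pretraining.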
decoder/models.transformers.TransformerEncoder:
    model_dim=%MODEL_DIM
    num_layers=%NUM_DECODER_LAYERS
    attention_layer=@decoder/models.transformers.MultiHeadAttention
    compile=True

decoder/models.transformers.MultiHeadAttention:
    model_dim=%MODEL_DIM
    num_heads=%NUM_DECODER_HEADS
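
# Positional information: fixed sinusoidal embeddings at MODEL_DIM.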
models.transformers.SinusoidalPositionalEmbeddings.embedding_dim=%MODEL_DIM
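
# Head: per-frame classification over NUM_TARGET_TOKENS classes for each of
# the NUM_TOTAL_TARGETS target streams.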
models.encodecmae.heads.FrameLevelClassificationHead:
    model_dim=%MODEL_DIM
    num_tokens=%NUM_TARGET_TOKENS
    num_streams=%NUM_TOTAL_TARGETS
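
# Usage sketch (not part of the config): one plausible way to materialize
# this file with the gin-config library. The filename, the PRETRAIN_MAX_LR
# value, and the module that exposes get_model are assumptions for
# illustration; the repo's own entry point may differ.
#
#   import gin
#   import models  # registers the configurables referenced above (assumed)
#
#   gin.parse_config('PRETRAIN_MAX_LR = 1e-4')  # bound elsewhere in practice
#   gin.parse_config_file('pretrain_base.gin')  # this file (name assumed)
#   model = get_model()                         # the configured EncodecMAE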