speechbrain
/

mtl-mimic-voicebank

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:eea2ed64b9b136ccfa66741860d47b4a3ea6954bb8eb07d3212a14b601a0d3fb
-size 29005818

 version https://git-lfs.github.com/spec/v1
+oid sha256:348bdc866632457e60d9eea38aa9a511910b89cd0c1ad1b78c229535bd5b60e6
+size 89230845

hyperparams.yaml CHANGED Viewed

@@ -4,67 +4,21 @@ n_fft: 512
 win_length: 32
 hop_length: 16
-# Enhancement model args
-emb_channels: 1024
-emb_kernel_size: 3
-emb_padding: same
-enhancer_size: 512
-enhancer_layers: 8
-enhancer_heads: 8
-enhancer_causal: False
-enhancer_drop_rate: 0.1
-compute_stft: !new:speechbrain.processing.features.STFT
-    sample_rate: !ref <sample_rate>
     n_fft: !ref <n_fft>
     win_length: !ref <win_length>
     hop_length: !ref <hop_length>
-compute_istft: !new:speechbrain.processing.features.ISTFT
     sample_rate: !ref <sample_rate>
-    n_fft: !ref <n_fft>
-    win_length: !ref <win_length>
-    hop_length: !ref <hop_length>
-spectral_magnitude: !name:speechbrain.processing.features.spectral_magnitude
-    power: 0.5
-resynth: !name:speechbrain.processing.signal_processing.resynthesize
-    stft: !ref <compute_stft>
-    istft: !ref <compute_istft>
-enhance_model: !new:speechbrain.lobes.models.transformer.TransformerSE.CNNTransformerSE
-    output_size: !ref <n_fft> // 2 + 1
-    d_model: !ref <n_fft> // 2
-    output_activation: !name:torch.nn.ReLU
-    activation: !name:torch.nn.LeakyReLU
-    dropout: !ref <enhancer_drop_rate>
-    num_layers: !ref <enhancer_layers>
-    d_ffn: !ref <enhancer_size>
-    nhead: !ref <enhancer_heads>
-    causal: !ref <enhancer_causal>
-    custom_emb_module: !new:speechbrain.nnet.containers.Sequential
-        input_shape: [null, null, !ref <n_fft> // 2 + 1]
-        conv1: !name:speechbrain.nnet.CNN.Conv1d
-            out_channels: !ref <emb_channels>
-            kernel_size: 3
-        norm1: !name:speechbrain.nnet.normalization.LayerNorm
-        act1: !new:torch.nn.LeakyReLU
-        conv2: !name:speechbrain.nnet.CNN.Conv1d
-            out_channels: !ref <emb_channels> // 2
-            kernel_size: 3
-        norm2: !name:speechbrain.nnet.normalization.LayerNorm
-        act2: !new:torch.nn.LeakyReLU
-        conv3: !name:speechbrain.nnet.CNN.Conv1d
-            out_channels: !ref <emb_channels> // 4
-            kernel_size: 3
-        norm3: !name:speechbrain.nnet.normalization.LayerNorm
-        act3: !new:torch.nn.LeakyReLU
-        conv4: !name:speechbrain.nnet.CNN.Conv1d
-            out_channels: !ref <emb_channels> // 4
-            kernel_size: 3
-        norm4: !name:speechbrain.nnet.normalization.LayerNorm
-        act4: !new:torch.nn.LeakyReLU
 modules:
     enhance_model: !ref <enhance_model>

 win_length: 32
 hop_length: 16
+mask_weight: 0.99
+# Enhancement model args
+enhance_model: !new:speechbrain.lobes.models.EnhanceResnet.EnhanceResnet
     n_fft: !ref <n_fft>
     win_length: !ref <win_length>
     hop_length: !ref <hop_length>
     sample_rate: !ref <sample_rate>
+    channel_counts: [128, 128, 256, 256, 512, 512]
+    normalization: !name:speechbrain.nnet.normalization.BatchNorm2d
+    activation: !new:torch.nn.GELU
+    dense_count: 2
+    dense_nodes: 1024
+    dropout: 0.1
+    mask_weight: !ref <mask_weight>
 modules:
     enhance_model: !ref <enhance_model>