MLSpeech committed
Commit 4fe4892 · verified · 1 Parent(s): 33b8135

Allow upload of all file types

Files changed (1)
  1. app.py +414 -414
app.py CHANGED
@@ -1,414 +1,414 @@
import argparse
import logging
from pathlib import Path
from tqdm import tqdm
import torch
import torchaudio
import soundfile as sf
import time
from typing import TypedDict
from enum import Enum
import gradio as gr

SR = 16000
VAD_EXPAND_HEAD_SEC = 0.2
VAD_EXPAND_TAIL_SEC = 0.2


class SPEECH_ARRAY_INDEX(TypedDict):
    """
    TypedDict for representing speech segments in audio.
    This dictionary contains the start and end indices of a speech segment retrieved from VAD processing.

    Args:
        start (float): Start index of the speech segment in samples.
        end (float): End index of the speech segment in samples.
    """
    start: float
    end: float


class SilenceTrimMode(Enum):
    """
    Enumeration for different silence trimming modes in audio processing.

    This enum defines various options for trimming silence from audio segments,
    allowing fine-grained control over which parts of the audio should have
    silence removed.

    Attributes:
        LEADING (str): Remove silence only from the beginning of the audio.
        TRAILING (str): Remove silence only from the end of the audio.
        EDGES (str): Remove silence from both the beginning and end of the audio.
        ALL (str): Remove all silence segments throughout the entire audio.
    """

    LEADING = "leading"
    TRAILING = "trailing"
    EDGES = "edges"
    ALL = "all"


class VAD:
    def __init__(
        self,
        sr: int,
        remove_short: bool = False,
        pad_segments: bool = True,
        expand_head_sec: float = VAD_EXPAND_HEAD_SEC,
        expand_tail_sec: float = VAD_EXPAND_TAIL_SEC,
        trim_mode: SilenceTrimMode = SilenceTrimMode.EDGES,
    ):
        """Initialize the VAD processor.

        Args:
            sr (int): Sampling rate of input audio.
            remove_short (bool): Whether to remove short speech segments. Default is False.
            pad_segments (bool): Whether to expand detected segments with padding. Default is True.
            expand_head_sec (float): Padding in seconds to add before each segment. Default is 0.2.
            expand_tail_sec (float): Padding in seconds to add after each segment. Default is 0.2.
            trim_mode (SilenceTrimMode): Mode to use for trimming silence. Default is trim silence from edges. Options are:
                - SilenceTrimMode.LEADING: Remove silence only from the beginning.
                - SilenceTrimMode.TRAILING: Remove silence only from the end.
                - SilenceTrimMode.EDGES: Remove silence from both the beginning and end.
                - SilenceTrimMode.ALL: Remove all silence segments throughout the audio.
        """
        self.sr = sr
        self.pad_segments = pad_segments
        self.remove_short = remove_short
        self.expand_head_sec = expand_head_sec
        self.expand_tail_sec = expand_tail_sec
        self.trim_mode = trim_mode
        self.min_segment_dur = 1.0

        vad_components = torch.hub.load(
            repo_or_dir="snakers4/silero-vad",
            model="silero_vad",
            trust_repo=True,
            skip_validation=True,
        )
        self.vad_model, utils = vad_components  # type: ignore
        self._detect_speech, _, _, *_ = utils

    def _remove_short_segments(self, segments: list[SPEECH_ARRAY_INDEX]) -> list[SPEECH_ARRAY_INDEX]:
        """Remove speech segments shorter than the configured minimum duration."""
        return [s for s in segments if s["end"] - s["start"] > self.min_segment_dur * self.sr]

    def _expand_segments(
        self, segments: list[SPEECH_ARRAY_INDEX], expand_head: int, expand_tail: int, total_length: int
    ) -> list[SPEECH_ARRAY_INDEX]:
        """Expand speech segments with padding before and after, constrained by surrounding segments and total length.

        Args:
            segments (list[SPEECH_ARRAY_INDEX]): List of speech segments.
            expand_head (int): Padding to add before each segment in samples.
            expand_tail (int): Padding to add after each segment in samples.
            total_length (int): Total length of the audio in samples.

        Returns:
            list[SPEECH_ARRAY_INDEX]: Expanded list of speech segments.
        """
        results = []
        for i, t in enumerate(segments):
            start = max(t["start"] - expand_head, segments[i - 1]["end"] if i > 0 else 0)
            end = min(t["end"] + expand_tail, segments[i + 1]["start"] if i < len(segments) - 1 else total_length)
            results.append({"start": start, "end": end})
        return results

    def _postprocess_segments(
        self, segments: list[SPEECH_ARRAY_INDEX], audio_len: int
    ) -> list[SPEECH_ARRAY_INDEX]:
        """Apply filtering and padding to detected speech segments. If no segments are detected, return a default segment covering the entire audio.

        Args:
            segments (list[SPEECH_ARRAY_INDEX]): Detected speech segments.
            audio_len (int): Length of the audio signal in samples. Used to ensure segments do not exceed audio length.

        Returns:
            list[SPEECH_ARRAY_INDEX]: Postprocessed speech segments.
        """
        if self.remove_short:
            segments = self._remove_short_segments(segments)
        if self.pad_segments:
            expand_head = int(self.expand_head_sec * self.sr)
            expand_tail = int(self.expand_tail_sec * self.sr)
            segments = self._expand_segments(segments, expand_head, expand_tail, audio_len)
        return segments if segments else [{"start": 0, "end": audio_len}]

    def _trim_audio(self, audio: torch.Tensor, segments: list[SPEECH_ARRAY_INDEX]) -> torch.Tensor:
        """Trim the input audio tensor according to the configured silence trim mode.

        Args:
            audio (torch.Tensor): Input audio tensor.
            segments (list[SPEECH_ARRAY_INDEX]): Processed speech segments.

        Returns:
            torch.Tensor: Trimmed audio tensor.
        """
        if not segments:
            return audio.unsqueeze(0)

        if self.trim_mode is SilenceTrimMode.ALL:
            speech = torch.cat([audio[int(s["start"]):int(s["end"])] for s in segments])
        else:
            first_start = int(segments[0]["start"])
            last_end = int(segments[-1]["end"])
            if self.trim_mode is SilenceTrimMode.LEADING:
                speech = audio[first_start:]
            elif self.trim_mode is SilenceTrimMode.TRAILING:
                speech = audio[:last_end]
            elif self.trim_mode is SilenceTrimMode.EDGES:
                speech = audio[first_start:last_end]
            else:
                raise ValueError(f"Unsupported trim_mode: {self.trim_mode}")

        return speech.unsqueeze(0)

    def __call__(self, audio: torch.Tensor) -> torch.Tensor:
        """Apply VAD processing and silence trimming to an audio tensor.

        Args:
            audio (torch.Tensor): Audio tensor, either [samples] or [1, samples].

        Returns:
            torch.Tensor: Trimmed audio tensor with silence removed.
        """
        if audio.dim() == 2:
            audio = audio[0]

        tic = time.time()
        segments = self._detect_speech(audio, model=self.vad_model, sampling_rate=self.sr)
        segments = self._postprocess_segments(segments, len(audio))
        logging.debug(f"Detected speech in {time.time() - tic:.1f} sec")

        return self._trim_audio(audio, segments)


def preprocess_input_lst(input_lst_path: str) -> list[Path]:
    """
    Load a list of audio file paths from a text file.

    Args:
        input_lst_path (str): Path to a text file containing audio file paths, one per line.

    Returns:
        list[Path]: List of audio file paths.
    """
    with open(input_lst_path, "r") as f:
        return [Path(line.strip()) for line in f if line.strip()]


def preprocess_input_dir(input_dir: Path) -> list[Path]:
    """
    Recursively collect all .wav audio file paths from a directory.

    Args:
        input_dir (Path): Path to the base directory to search for .wav files.

    Returns:
        list[Path]: List of full paths to .wav files.
    """
    return list(input_dir.rglob("*.wav"))


def setup_logger(log_file: Path, verbose: bool = False) -> None:
    """
    Configure the logging module to write to file and stdout.

    Args:
        log_file (Path): Path to the log file.
        verbose (bool, optional): Whether to enable verbose logging. Defaults to False.
    """
    log_file.parent.mkdir(parents=True, exist_ok=True)
    logging.basicConfig(
        level=logging.INFO if not verbose else logging.DEBUG,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[logging.FileHandler(log_file, mode="w"), logging.StreamHandler()],
    )


def apply_vad(
    input_lst: list[Path],
    output_dir: Path,
    input_base_dir: str | Path | None = None,
    expand_head_sec: float = VAD_EXPAND_HEAD_SEC,
    expand_tail_sec: float = VAD_EXPAND_TAIL_SEC,
    trim_mode: SilenceTrimMode = SilenceTrimMode.EDGES,
) -> None:
    """
    Apply VAD to a list of input audio files and save the processed outputs.

    Args:
        input_lst (list[Path]): List of audio file paths to process.
        output_dir (Path): Directory to save the processed audio files.
        input_base_dir (str | Path | None, optional): If provided, preserve directory structure relative to this base.
    """
    logging.info(f"Processing {len(input_lst)} files from {input_base_dir} to {output_dir}")
    logging.info(f"Creating VAD model with sampling rate {SR} and expand head {expand_head_sec} sec")
    vad = VAD(
        sr=SR, pad_segments=True, expand_head_sec=expand_head_sec, expand_tail_sec=expand_tail_sec, trim_mode=trim_mode
    )
    for wav_file in tqdm(input_lst, desc="Applying VAD"):
        try:
            if input_base_dir is not None:
                # Keep tree hierarchy relative to base dir
                rel_path = wav_file.relative_to(input_base_dir)
                out_file = output_dir / rel_path
            else:
                # Copy to output dir as is (just the filename)
                out_file = output_dir / (wav_file.stem + "_vad" + wav_file.suffix)

            out_file.parent.mkdir(parents=True, exist_ok=True)

            audio, sr = torchaudio.load(str(wav_file))
            if sr != SR:
                audio = torchaudio.functional.resample(audio, sr, SR)
                sr = SR

            audio_vad = vad(audio)
            sf.write(out_file, audio_vad.squeeze().numpy(), sr)
            logging.debug(f"Saved: {out_file}")

        except Exception as e:
            logging.error(f"Failed to process {wav_file}: {e}")
    print(f"VAD processing complete. Processed {len(input_lst)} files. Outputs saved to {output_dir}")


def apply_vad_gradio(wav_file):
    vad = VAD(sr=SR, pad_segments=True, expand_head_sec=0.2, expand_tail_sec=0.2, trim_mode=SilenceTrimMode.EDGES)
    audio, sr = torchaudio.load(str(wav_file))
    if sr != SR:
        audio = torchaudio.functional.resample(audio, sr, SR)
        sr = SR
    audio_vad = vad(audio)
    sf.write("output.wav", audio_vad.squeeze().numpy(), sr)
    return 'output.wav'


def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments for the VAD processing script.

    Returns:
        argparse.Namespace: Parsed arguments.
    """
    parser = argparse.ArgumentParser(description="Apply VAD to all .wav files in a directory tree.")
    parser.add_argument(
        "--input_dir",
        type=Path,
        help="Path to input directory. Also used as the base input directory for relative paths.",
    )
    parser.add_argument("--input_lst", type=Path, help="Path to input list file with audio paths")
    parser.add_argument("--output_dir", type=Path, help="Path to output directory")
    parser.add_argument("--debug_file", type=Path, help="Optional: Path to a single file to test VAD on")
    parser.add_argument("--expand_head_sec", type=float, default=VAD_EXPAND_HEAD_SEC)
    parser.add_argument("--expand_tail_sec", type=float, default=VAD_EXPAND_TAIL_SEC)
    parser.add_argument(
        "--trim_mode",
        type=str,
        default=SilenceTrimMode.EDGES.value,
        choices=[m.value for m in SilenceTrimMode],
        help="Silence trim mode: " + ", ".join(m.value for m in SilenceTrimMode),
    )
    parser.add_argument("--verbose", action="store_true", help="Enable verbose logging")
    args = parser.parse_args()

    # Validation logic
    if args.debug_file:
        # Debug mode - only debug_file is needed
        if args.input_dir or args.input_lst or args.output_dir:
            parser.error("When using --debug_file, do not provide --input_dir, --input_lst, or --output_dir.")
    else:
        # Normal mode - need output_dir and either input_dir or input_lst
        if not args.output_dir:
            parser.error("--output_dir is required when not using --debug_file.")
        if not args.input_dir and not args.input_lst:
            parser.error("Either --input_dir or --input_lst must be provided when not using --debug_file.")
    args.trim_mode = SilenceTrimMode(args.trim_mode)
    return args


def run_debug_file(
    debug_file: str,
    expand_head_sec: float = VAD_EXPAND_HEAD_SEC,
    expand_tail_sec: float = VAD_EXPAND_TAIL_SEC,
    trim_mode: SilenceTrimMode = SilenceTrimMode.EDGES,
) -> None:
    """
    Run VAD on a single debug audio file and save the result.

    Args:
        debug_file (str): Path to the debug audio file.
        expand_head_sec (float): Padding duration in seconds before each segment.
        expand_tail_sec (float): Padding duration in seconds after each segment.
    """
    debug_path = Path(debug_file).resolve()

    logging.info(f"Running VAD debug on: {debug_path}")
    audio, sr = torchaudio.load(debug_path)

    if sr != SR:
        logging.info(f"Resampling from {sr} → {SR}")
        audio = torchaudio.functional.resample(audio, sr, SR)
        sr = SR

    vad = VAD(
        sr=SR, pad_segments=True, expand_head_sec=expand_head_sec, expand_tail_sec=expand_tail_sec, trim_mode=trim_mode
    )
    audio_vad = vad(audio)

    out_path = debug_path.with_name(debug_path.stem + "_vad.wav")
    sf.write(out_path, audio_vad.squeeze().numpy(), sr)
    logging.info(f"Saved VAD output to: {out_path}")


with gr.Blocks() as demo:
    with gr.Row():
-        inputFile = gr.File(label="wav files", file_count="single", file_types=[".wav"])
+        inputFile = gr.File(label="wav files", file_count="single")
        runbtn = gr.Button("Run")
    audio = gr.Audio(label="output")
    runbtn.click(fn=apply_vad_gradio, inputs=[inputFile], outputs=audio)

if __name__ == "__main__":
    demo.launch(ssr_mode=False)
    # Optional: override args for debugging
    #import sys

    # sys.argv = [
    # "script.py",
    # "--output_dir",
    # "/mlspeech/data/eyalcohen/datasets/intelligibility/sandi2025_challenge/tts_data/debug_train_files/with_vad_head_03_tail_03",
    # "--input_lst",
    # "/mlspeech/data/eyalcohen/datasets/intelligibility/sandi2025_challenge/tts_data/with_vad/normalized/debug_train_files.txt",
    # "--expand_head_sec",
    # "0.3",
    # "--expand_tail_sec",
    # "0.3",
    # "--verbose",
    # # "--debug_file",
    # # "/mlspeech/data/eyalcohen/datasets/intelligibility/sandi2025_challenge/tts_data/no_vad/normalized/train/sla-P1/SI137O-00982-P10005-AM_FENRIR.wav",
    # ]



    #
    # args = parse_args()
    # log_file = args.output_dir / "vad_processing.log"
    # setup_logger(log_file, verbose=args.verbose)
    # if args.debug_file:
    # run_debug_file(args.debug_file, args.expand_head_sec, args.expand_tail_sec, args.trim_mode)
    # else:
    # if args.input_lst:
    # input_lst = preprocess_input_lst(args.input_lst)
    # elif args.input_dir:
    # input_lst = preprocess_input_dir(args.input_dir)
    # else:
    # raise ValueError("Either --input_lst or --input_dir must be provided.")
    # apply_vad(
    # input_lst,
    # args.output_dir,
    # args.input_dir,
    # args.expand_head_sec,
    # args.expand_tail_sec,
    # trim_mode=args.trim_mode,
    # )
 