parakeet-tdt-0.6b-v2

Running

App Files Files Community

sungo-ganpare committed on May 24

Commit

a89d03e

1 Parent(s): 2311e41

ffmpegを使用して大容量音声ファイルの処理を改善し、音声長の取得機能を追加

Browse files

Files changed (3) hide show

.dockerignore +37 -0
Dockerfile +26 -0
transcribe_cli.py +157 -13

.dockerignore ADDED Viewed

	@@ -0,0 +1,37 @@

+# Git
+.git
+.gitignore
+# Python
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+# Virtual Environment
+.env
+.venv
+venv/
+ENV/
+# Output directories
+outputs/
+data/
+# IDE specific files
+.idea
+.vscode
+*.swp
+*.swo

Dockerfile ADDED Viewed

	@@ -0,0 +1,26 @@

# Use NVIDIA CUDA base image
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04
# Set working directory
WORKDIR /app
# Install system dependencies.
# DEBIAN_FRONTEND is set for this command only, so package configuration
# prompts cannot stall the build waiting for interactive input.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    python3 \
    python3-pip \
    git \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*
# Install Python dependencies before copying the script, so this (slow)
# layer is rebuilt only when requirements.txt changes.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
# Copy the application code last; editing it no longer invalidates the
# dependency layer above.
COPY transcribe_cli.py .
# Set environment variable for CUDA device
ENV CUDA_VISIBLE_DEVICES=0
# Set default command (the input directory /app/data is expected to be
# provided at run time, e.g. as a mounted volume)
ENTRYPOINT ["python3", "transcribe_cli.py", "/app/data"]

transcribe_cli.py CHANGED Viewed

@@ -13,6 +13,8 @@ import argparse
 import time # ★処理時間計測のために追加
 import sys # ★コマンドライン引数チェックのために追加
 from nemo.collections.asr.models import ASRModel # NeMo ASRモデル
 # --- グローバル設定 ---
 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
@@ -42,14 +44,39 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
         audio_name_stem = audio_file_path.stem
         print(f"  音声ファイルをロード中: {original_path_name}")
-        audio = AudioSegment.from_file(audio_path_str)
-        duration_sec = audio.duration_seconds
         print(f"  音声長: {duration_sec:.2f} 秒")
     except FileNotFoundError:
         print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
         return None, None, None
-    except Exception as load_e: # pydub.exceptions.CouldntDecodeError などを含む
         print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
         return None, None, None
@@ -81,24 +108,28 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
     elif audio.channels == 1:
         print("  音声は既にモノラルです。")
-    processed_temp_file_path_obj = None # 一時ファイルのPathオブジェクト
     # 前処理が行われた場合、一時ファイルに保存
     if resampled or mono_converted:
         try:
-            temp_suffix = "_preprocessed_temp.wav" # 一時ファイルとわかるような接尾辞
-            processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{audio_name_stem}{temp_suffix}")
             print(f"  前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
             audio.export(processed_temp_file_path_obj, format="wav")
-            path_for_transcription = processed_temp_file_path_obj.as_posix() # 文字起こしに使用するパス
             display_name_for_info = f"{original_path_name} (前処理済み)"
         except Exception as export_e:
             print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
-            # エクスポート失敗時、もしファイルが作られていたら削除試行
             if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
-                try: os.remove(processed_temp_file_path_obj)
-                except OSError: pass # 削除エラーはここでは致命的ではない
             return None, None, None
     else:
         # 前処理が不要だった場合
@@ -108,6 +139,33 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
     return path_for_transcription, display_name_for_info, duration_sec
 # --- 文字起こしコア関数 ---
 def transcribe_audio_cli(
@@ -373,10 +431,28 @@ def split_audio_with_overlap_cli(
     overlap_sec: int = CHUNK_OVERLAP_SECONDS
 ) -> List[str]:
     print(f"  音声分割中: 基本チャンク長 {chunk_length_sec}s, オーバーラップ {overlap_sec}s")
-    try: audio = AudioSegment.from_file(audio_path_str)
     except Exception as e:
-        print(f"  エラー: 音声ファイル '{Path(audio_path_str).name}' のロード中にエラー（分割処理）: {e}")
-        return []
     duration_ms = len(audio); chunk_length_ms = chunk_length_sec * 1000; overlap_ms = overlap_sec * 1000
     chunk_paths_list: List[str] = []; start_ms = 0; chunk_idx = 0
     audio_file_stem = Path(audio_path_str).stem
@@ -400,6 +476,74 @@ def split_audio_with_overlap_cli(
     print(f"  音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
     return chunk_paths_list
 # --- 単一ファイル処理のメインロジック ---
 def process_single_file(
     input_file_path_obj: Path,

 import time # ★処理時間計測のために追加
 import sys # ★コマンドライン引数チェックのために追加
 from nemo.collections.asr.models import ASRModel # NeMo ASRモデル
+import subprocess
+import shutil
 # --- グローバル設定 ---
 MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
         audio_name_stem = audio_file_path.stem
         print(f"  音声ファイルをロード中: {original_path_name}")
+        # まずffprobeで音声長を取得（4GB制限なし）
+        duration_sec = get_audio_duration_with_ffprobe(audio_path_str)
+        if duration_sec is None:
+            print("エラー: ffprobeで音声長の取得に失敗しました")
+            return None, None, None
         print(f"  音声長: {duration_sec:.2f} 秒")
+        # ファイルサイズをチェック
+        file_size = Path(audio_path_str).stat().st_size
+        file_size_gb = file_size / (1024**3)
+        print(f"  ファイルサイズ: {file_size_gb:.2f} GB")
+        # 4GB以上またはVERY_LONG_AUDIO_THRESHOLD_SECONDS以上の場合は直接ffmpegでチャンク分割
+        if file_size > 4 * 1024**3 or duration_sec > VERY_LONG_AUDIO_THRESHOLD_SECONDS:
+            print(f"  大容量ファイル（{file_size_gb:.2f}GB, {duration_sec/3600:.2f}時間）のため、ffmpegで直接チャンク分割処理を行います。")
+            return audio_path_str, f"{original_path_name} (大容量)", duration_sec
+        # 4GB未満の場合は従来のpydub処理
+        try:
+            audio = AudioSegment.from_file(audio_path_str)
+        except Exception as pydub_e:
+            if "4GB" in str(pydub_e) or "Unable to process" in str(pydub_e):
+                print(f"  pydubで4GB制限エラー。ffmpegで処理します: {pydub_e}")
+                return audio_path_str, f"{original_path_name} (大容量)", duration_sec
+            else:
+                raise pydub_e
     except FileNotFoundError:
         print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
         return None, None, None
+    except Exception as load_e:
         print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
         return None, None, None
     elif audio.channels == 1:
         print("  音声は既にモノラルです。")
+    processed_temp_file_path_obj = None
     # 前処理が行われた場合、一時ファイルに保存
     if resampled or mono_converted:
         try:
+            # ファイル名から特殊文字を除去してより安全な名前を生成
+            import re
+            safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
+            temp_suffix = "_preprocessed_temp.wav"
+            processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
             print(f"  前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
             audio.export(processed_temp_file_path_obj, format="wav")
+            path_for_transcription = processed_temp_file_path_obj.as_posix()
             display_name_for_info = f"{original_path_name} (前処理済み)"
         except Exception as export_e:
             print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
             if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
+                try:
+                    os.remove(processed_temp_file_path_obj)
+                except OSError:
+                    pass
             return None, None, None
     else:
         # 前処理が不要だった場合
     return path_for_transcription, display_name_for_info, duration_sec
def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
    """Return the duration of an audio file in seconds, as reported by ffprobe.

    The original author notes that this path is not subject to the 4 GB
    limit that affects the pydub-based loading used elsewhere in this file.

    Args:
        audio_path_str: Path of the audio file to inspect.

    Returns:
        The duration in seconds as a finite, non-negative float, or ``None``
        when ffprobe is not on PATH, exits with an error, times out (30 s),
        or reports a value that is not a usable duration (empty, ``N/A``,
        NaN, infinite or negative). This function never raises; every
        failure is printed and mapped to ``None``.
    """
    import math  # local import: only needed for the finiteness check below

    try:
        # Check that ffprobe is available before trying to run it.
        if not shutil.which('ffprobe'):
            # NOTE(review): the message announces a pydub fallback, but the
            # callers visible in this change treat None as a fatal error.
            # Confirm against the other callers and reword if none fall back.
            print("警告: ffprobeが見つかりません。pydubでの処理を試行します。")
            return None
        cmd = [
            'ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
            # Pass the input through -i so that a path starting with '-'
            # is not parsed as an ffprobe option.
            '-of', 'csv=p=0', '-i', audio_path_str
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)
        raw_duration = result.stdout.strip()
        if result.returncode != 0 or not raw_duration:
            print(f"ffprobeエラー: {result.stderr}")
            return None
        # float() raises ValueError for non-numeric output such as "N/A";
        # that is reported by the generic handler below.
        duration = float(raw_duration)
        # float() happily parses "nan"/"inf" and negative numbers, none of
        # which is a duration the chunking code can work with.
        if not math.isfinite(duration) or duration < 0:
            print(f"ffprobeエラー: 不正な音声長です: {raw_duration}")
            return None
        return duration
    except subprocess.TimeoutExpired:
        print("ffprobeがタイムアウトしました")
        return None
    except Exception as e:
        # Deliberately broad: callers rely on "None on any failure".
        print(f"ffprobeでの音声長取得エラー: {e}")
        return None
 # --- 文字起こしコア関数 ---
 def transcribe_audio_cli(
     overlap_sec: int = CHUNK_OVERLAP_SECONDS
 ) -> List[str]:
     print(f"  音声分割中: 基本チャンク長 {chunk_length_sec}s, オーバーラップ {overlap_sec}s")
+    # ファイルサイズをチェックして処理方法を決定
+    file_size = Path(audio_path_str).stat().st_size
+    file_size_gb = file_size / (1024**3)
+    # 4GB以上の場合はffmpegを使用
+    if file_size > 4 * 1024**3:
+        print(f"  大容量ファイル（{file_size_gb:.2f}GB）のため、ffmpegで分割処理を実行します。")
+        return split_audio_with_ffmpeg(audio_path_str, output_dir_for_chunks, chunk_length_sec, overlap_sec)
+    # 4GB未満の場合は従来のpydub処理
+    try:
+        audio = AudioSegment.from_file(audio_path_str)
     except Exception as e:
+        if "4GB" in str(e) or "Unable to process" in str(e):
+            print(f"  pydubで4GB制限エラー。ffmpegで処理します: {e}")
+            return split_audio_with_ffmpeg(audio_path_str, output_dir_for_chunks, chunk_length_sec, overlap_sec)
+        else:
+            print(f"  エラー: 音声ファイル '{Path(audio_path_str).name}' のロード中にエラー（分割処理）: {e}")
+            return []
+    # 以下は既存のpydub処理...
     duration_ms = len(audio); chunk_length_ms = chunk_length_sec * 1000; overlap_ms = overlap_sec * 1000
     chunk_paths_list: List[str] = []; start_ms = 0; chunk_idx = 0
     audio_file_stem = Path(audio_path_str).stem
     print(f"  音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
     return chunk_paths_list
def split_audio_with_ffmpeg(
    audio_path_str: str,
    output_dir_for_chunks: str,
    chunk_length_sec: int,
    overlap_sec: int
) -> List[str]:
    """Split a large audio file into overlapping WAV chunks using ffmpeg.

    The file is walked in steps of ``chunk_length_sec``. Each chunk is
    widened by ``overlap_sec`` at its start (except the first chunk) and at
    its end (except when the base window already reaches the end of the
    audio). Every chunk is written by a separate ffmpeg run as 16-bit PCM,
    mono, at ``TARGET_SAMPLE_RATE``, named
    ``<input stem>_chunk_<NNN>_temp.wav`` inside ``output_dir_for_chunks``.

    Args:
        audio_path_str: Path of the audio file to split.
        output_dir_for_chunks: Directory that receives the chunk files.
        chunk_length_sec: Base chunk length in seconds; must be positive.
        overlap_sec: Overlap added around each chunk in seconds; must not
            be negative.

    Returns:
        POSIX-style paths of the chunks that were written successfully, in
        order. An empty list is returned when the parameters are invalid,
        ffmpeg is not on PATH, the duration cannot be determined, or an
        unexpected error occurs. A chunk whose ffmpeg run fails or times out
        (300 s) is skipped, so the list can be shorter than the number of
        windows.
    """
    # A non-positive step would never advance start_sec (endless loop for
    # chunk_length_sec == 0 with a positive overlap); reject it up front.
    if chunk_length_sec <= 0 or overlap_sec < 0:
        print(
            "  エラー: チャンク長は正の値、オーバーラップは0以上である必要があります "
            f"(chunk_length_sec={chunk_length_sec}, overlap_sec={overlap_sec})"
        )
        return []
    try:
        if not shutil.which('ffmpeg'):
            print("エラー: ffmpegが見つかりません。4GB以上のファイルを処理するにはffmpegが必要です。")
            return []
        # Total duration drives the chunk boundaries.
        duration_sec = get_audio_duration_with_ffprobe(audio_path_str)
        if duration_sec is None:
            print("エラー: ffmpegでの分割処理で音声長を取得できませんでした")
            return []
        chunk_paths_list: List[str] = []
        audio_file_stem = Path(audio_path_str).stem
        start_sec = 0
        chunk_idx = 0
        while start_sec < duration_sec:
            # Compute the chunk window, widened by the overlap where there
            # is neighbouring audio on that side.
            actual_start_sec = max(0, start_sec - (overlap_sec if start_sec > 0 else 0))
            base_end_sec = start_sec + chunk_length_sec
            actual_end_sec = min(base_end_sec + (overlap_sec if base_end_sec < duration_sec else 0), duration_sec)
            if actual_start_sec >= actual_end_sec:
                break
            chunk_duration = actual_end_sec - actual_start_sec
            chunk_file_name = f"{audio_file_stem}_chunk_{chunk_idx:03d}_temp.wav"
            chunk_file_path = Path(output_dir_for_chunks) / chunk_file_name
            # Extract and convert the window with ffmpeg. -ss is placed
            # before -i so the seek is applied to the input.
            cmd = [
                'ffmpeg', '-y', '-loglevel', 'error',
                '-ss', str(actual_start_sec),
                '-i', audio_path_str,
                '-t', str(chunk_duration),
                '-acodec', 'pcm_s16le',
                '-ar', str(TARGET_SAMPLE_RATE),
                '-ac', '1',  # mono
                str(chunk_file_path)
            ]
            chunk_written = False
            try:
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
                if result.returncode == 0:
                    chunk_paths_list.append(chunk_file_path.as_posix())
                    chunk_written = True
                    print(f"    チャンク {chunk_idx+1}: {actual_start_sec:.1f}s - {actual_end_sec:.1f}s -> {chunk_file_name}")
                else:
                    print(f"  エラー: チャンク {chunk_idx} の生成に失敗: {result.stderr}")
            except subprocess.TimeoutExpired:
                print(f"  エラー: チャンク {chunk_idx} の生成がタイムアウトしました")
            if not chunk_written:
                # A failed or killed ffmpeg run can leave a partial file that
                # is not in the returned list, so nobody else would clean it
                # up. Removal is best-effort: the file may not exist at all.
                try:
                    chunk_file_path.unlink()
                except OSError:
                    pass
                # NOTE(review): skipped chunks leave a gap in the returned
                # list; verify that callers do not derive time offsets from
                # the list index.
            start_sec += chunk_length_sec
            chunk_idx += 1
        print(f"  ffmpegで音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
        return chunk_paths_list
    except Exception as e:
        # Deliberately broad: callers receive an empty list on any failure.
        print(f"  エラー: ffmpegでの音声分割中にエラー: {e}")
        return []
 # --- 単一ファイル処理のメインロジック ---
 def process_single_file(
     input_file_path_obj: Path,