Commit
·
a89d03e
1
Parent(s):
2311e41
ffmpegを使用して大容量音声ファイルの処理を改善し、音声長の取得機能を追加
Browse files
- .dockerignore +37 -0
- Dockerfile +26 -0
- transcribe_cli.py +157 -13
.dockerignore
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Git
|
| 2 |
+
.git
|
| 3 |
+
.gitignore
|
| 4 |
+
|
| 5 |
+
# Python
|
| 6 |
+
__pycache__
|
| 7 |
+
*.pyc
|
| 8 |
+
*.pyo
|
| 9 |
+
*.pyd
|
| 10 |
+
.Python
|
| 11 |
+
env
|
| 12 |
+
pip-log.txt
|
| 13 |
+
pip-delete-this-directory.txt
|
| 14 |
+
.tox
|
| 15 |
+
.coverage
|
| 16 |
+
.coverage.*
|
| 17 |
+
.cache
|
| 18 |
+
nosetests.xml
|
| 19 |
+
coverage.xml
|
| 20 |
+
*.cover
|
| 21 |
+
*.log
|
| 22 |
+
|
| 23 |
+
# Virtual Environment
|
| 24 |
+
.env
|
| 25 |
+
.venv
|
| 26 |
+
venv/
|
| 27 |
+
ENV/
|
| 28 |
+
|
| 29 |
+
# Output directories
|
| 30 |
+
outputs/
|
| 31 |
+
data/
|
| 32 |
+
|
| 33 |
+
# IDE specific files
|
| 34 |
+
.idea
|
| 35 |
+
.vscode
|
| 36 |
+
*.swp
|
| 37 |
+
*.swo
|
Dockerfile
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Use NVIDIA CUDA base image
FROM nvidia/cuda:11.8.0-runtime-ubuntu22.04

# Set working directory
WORKDIR /app

# Install system dependencies.
# DEBIAN_FRONTEND is set for this command only, so package configuration
# (e.g. tzdata) can never stop the build waiting for interactive input and
# the setting does not leak into the runtime environment of the container.
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \
    python3 \
    python3-pip \
    git \
    ffmpeg \
    && rm -rf /var/lib/apt/lists/*

# Install Python dependencies.
# requirements.txt is copied on its own first so that this (slow) layer is
# rebuilt only when the dependency list changes, not on every script edit.
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt

# Copy the application code last; editing it invalidates only this layer.
COPY transcribe_cli.py .

# Set environment variable for CUDA device
ENV CUDA_VISIBLE_DEVICES=0

# Set default command
ENTRYPOINT ["python3", "transcribe_cli.py", "/app/data"]
|
transcribe_cli.py
CHANGED
|
@@ -13,6 +13,8 @@ import argparse
|
|
| 13 |
import time # ★処理時間計測のために追加
|
| 14 |
import sys # ★コマンドライン引数チェックのために追加
|
| 15 |
from nemo.collections.asr.models import ASRModel # NeMo ASRモデル
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# --- グローバル設定 ---
|
| 18 |
MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
|
|
@@ -42,14 +44,39 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
|
|
| 42 |
audio_name_stem = audio_file_path.stem
|
| 43 |
|
| 44 |
print(f" 音声ファイルをロード中: {original_path_name}")
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
print(f" 音声長: {duration_sec:.2f} 秒")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
except FileNotFoundError:
|
| 50 |
print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
|
| 51 |
return None, None, None
|
| 52 |
-
except Exception as load_e:
|
| 53 |
print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
|
| 54 |
return None, None, None
|
| 55 |
|
|
@@ -81,24 +108,28 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
|
|
| 81 |
elif audio.channels == 1:
|
| 82 |
print(" 音声は既にモノラルです。")
|
| 83 |
|
| 84 |
-
processed_temp_file_path_obj = None
|
| 85 |
# 前処理が行われた場合、一時ファイルに保存
|
| 86 |
if resampled or mono_converted:
|
| 87 |
try:
|
| 88 |
-
|
| 89 |
-
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
print(f" 前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
|
| 92 |
audio.export(processed_temp_file_path_obj, format="wav")
|
| 93 |
|
| 94 |
-
path_for_transcription = processed_temp_file_path_obj.as_posix()
|
| 95 |
display_name_for_info = f"{original_path_name} (前処理済み)"
|
| 96 |
except Exception as export_e:
|
| 97 |
print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
|
| 98 |
-
# エクスポート失敗時、もしファイルが作られていたら削除試行
|
| 99 |
if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
|
| 100 |
-
try:
|
| 101 |
-
|
|
|
|
|
|
|
| 102 |
return None, None, None
|
| 103 |
else:
|
| 104 |
# 前処理が不要だった場合
|
|
@@ -108,6 +139,33 @@ def preprocess_audio_cli(audio_path_str: str, output_dir_for_temp_files: str) ->
|
|
| 108 |
|
| 109 |
return path_for_transcription, display_name_for_info, duration_sec
|
| 110 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
|
| 112 |
# --- 文字起こしコア関数 ---
|
| 113 |
def transcribe_audio_cli(
|
|
@@ -373,10 +431,28 @@ def split_audio_with_overlap_cli(
|
|
| 373 |
overlap_sec: int = CHUNK_OVERLAP_SECONDS
|
| 374 |
) -> List[str]:
|
| 375 |
print(f" 音声分割中: 基本チャンク長 {chunk_length_sec}s, オーバーラップ {overlap_sec}s")
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
except Exception as e:
|
| 378 |
-
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
duration_ms = len(audio); chunk_length_ms = chunk_length_sec * 1000; overlap_ms = overlap_sec * 1000
|
| 381 |
chunk_paths_list: List[str] = []; start_ms = 0; chunk_idx = 0
|
| 382 |
audio_file_stem = Path(audio_path_str).stem
|
|
@@ -400,6 +476,74 @@ def split_audio_with_overlap_cli(
|
|
| 400 |
print(f" 音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
|
| 401 |
return chunk_paths_list
|
| 402 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 403 |
# --- 単一ファイル処理のメインロジック ---
|
| 404 |
def process_single_file(
|
| 405 |
input_file_path_obj: Path,
|
|
|
|
| 13 |
import time # ★処理時間計測のために追加
|
| 14 |
import sys # ★コマンドライン引数チェックのために追加
|
| 15 |
from nemo.collections.asr.models import ASRModel # NeMo ASRモデル
|
| 16 |
+
import subprocess
|
| 17 |
+
import shutil
|
| 18 |
|
| 19 |
# --- グローバル設定 ---
|
| 20 |
MODEL_NAME = "nvidia/parakeet-tdt-0.6b-v2"
|
|
|
|
| 44 |
audio_name_stem = audio_file_path.stem
|
| 45 |
|
| 46 |
print(f" 音声ファイルをロード中: {original_path_name}")
|
| 47 |
+
|
| 48 |
+
# まずffprobeで音声長を取得(4GB制限なし)
|
| 49 |
+
duration_sec = get_audio_duration_with_ffprobe(audio_path_str)
|
| 50 |
+
if duration_sec is None:
|
| 51 |
+
print("エラー: ffprobeで音声長の取得に失敗しました")
|
| 52 |
+
return None, None, None
|
| 53 |
+
|
| 54 |
print(f" 音声長: {duration_sec:.2f} 秒")
|
| 55 |
+
|
| 56 |
+
# ファイルサイズをチェック
|
| 57 |
+
file_size = Path(audio_path_str).stat().st_size
|
| 58 |
+
file_size_gb = file_size / (1024**3)
|
| 59 |
+
print(f" ファイルサイズ: {file_size_gb:.2f} GB")
|
| 60 |
+
|
| 61 |
+
# 4GB以上またはVERY_LONG_AUDIO_THRESHOLD_SECONDS以上の場合は直接ffmpegでチャンク分割
|
| 62 |
+
if file_size > 4 * 1024**3 or duration_sec > VERY_LONG_AUDIO_THRESHOLD_SECONDS:
|
| 63 |
+
print(f" 大容量ファイル({file_size_gb:.2f}GB, {duration_sec/3600:.2f}時間)のため、ffmpegで直接チャンク分割処理を行います。")
|
| 64 |
+
return audio_path_str, f"{original_path_name} (大容量)", duration_sec
|
| 65 |
+
|
| 66 |
+
# 4GB未満の場合は従来のpydub処理
|
| 67 |
+
try:
|
| 68 |
+
audio = AudioSegment.from_file(audio_path_str)
|
| 69 |
+
except Exception as pydub_e:
|
| 70 |
+
if "4GB" in str(pydub_e) or "Unable to process" in str(pydub_e):
|
| 71 |
+
print(f" pydubで4GB制限エラー。ffmpegで処理します: {pydub_e}")
|
| 72 |
+
return audio_path_str, f"{original_path_name} (大容量)", duration_sec
|
| 73 |
+
else:
|
| 74 |
+
raise pydub_e
|
| 75 |
|
| 76 |
except FileNotFoundError:
|
| 77 |
print(f"エラー: 音声ファイルが見つかりません: {audio_path_str}")
|
| 78 |
return None, None, None
|
| 79 |
+
except Exception as load_e:
|
| 80 |
print(f"エラー: 音声ファイル '{original_path_name}' のロード/デコードに失敗しました: {load_e}")
|
| 81 |
return None, None, None
|
| 82 |
|
|
|
|
| 108 |
elif audio.channels == 1:
|
| 109 |
print(" 音声は既にモノラルです。")
|
| 110 |
|
| 111 |
+
processed_temp_file_path_obj = None
|
| 112 |
# 前処理が行われた場合、一時ファイルに保存
|
| 113 |
if resampled or mono_converted:
|
| 114 |
try:
|
| 115 |
+
# ファイル名から特殊文字を除去してより安全な名前を生成
|
| 116 |
+
import re
|
| 117 |
+
safe_stem = re.sub(r'[^\w\-_\.]', '_', audio_name_stem)
|
| 118 |
+
temp_suffix = "_preprocessed_temp.wav"
|
| 119 |
+
processed_temp_file_path_obj = Path(output_dir_for_temp_files, f"{safe_stem}{temp_suffix}")
|
| 120 |
|
| 121 |
print(f" 前処理済み音声の一時保存先: {processed_temp_file_path_obj.name}")
|
| 122 |
audio.export(processed_temp_file_path_obj, format="wav")
|
| 123 |
|
| 124 |
+
path_for_transcription = processed_temp_file_path_obj.as_posix()
|
| 125 |
display_name_for_info = f"{original_path_name} (前処理済み)"
|
| 126 |
except Exception as export_e:
|
| 127 |
print(f"エラー: 前処理済み音声のエクスポートに失敗しました: {export_e}")
|
|
|
|
| 128 |
if processed_temp_file_path_obj and processed_temp_file_path_obj.exists():
|
| 129 |
+
try:
|
| 130 |
+
os.remove(processed_temp_file_path_obj)
|
| 131 |
+
except OSError:
|
| 132 |
+
pass
|
| 133 |
return None, None, None
|
| 134 |
else:
|
| 135 |
# 前処理が不要だった場合
|
|
|
|
| 139 |
|
| 140 |
return path_for_transcription, display_name_for_info, duration_sec
|
| 141 |
|
| 142 |
+
def get_audio_duration_with_ffprobe(audio_path_str: str) -> Optional[float]:
    """Get the length of an audio file in seconds using ffprobe (no 4 GB limit).

    Args:
        audio_path_str: Path of the audio file to probe.

    Returns:
        The duration in seconds, or ``None`` when ffprobe is not on PATH,
        exits with a non-zero status, prints nothing, exceeds the 30 second
        timeout, or reports a value that is not a finite, non-negative
        number (for example ``N/A``).

    This function does not raise: every failure is printed and signalled
    through the ``None`` return value, which callers treat as an error.
    """
    import math  # local import: only needed for the sanity check below

    try:
        # Check that ffprobe is available before trying to run it.
        if not shutil.which('ffprobe'):
            # Callers abort on None, so do not promise a fallback here.
            print("警告: ffprobeが見つかりません。音声長を取得できません。")
            return None

        # Ask only for the container-level duration, printed as a bare value.
        cmd = [
            'ffprobe', '-v', 'quiet', '-show_entries', 'format=duration',
            '-of', 'csv=p=0', audio_path_str
        ]
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=30)

        raw_duration = result.stdout.strip()
        if result.returncode != 0 or not raw_duration:
            print(f"ffprobeエラー: {result.stderr}")
            return None

        try:
            duration = float(raw_duration)
        except ValueError:
            # ffprobe can print a non-numeric placeholder such as "N/A".
            print(f"ffprobeエラー: 音声長を数値として解釈できません: {raw_duration!r}")
            return None

        # Reject nan/inf/negative values: callers use the duration as a loop
        # bound and for size decisions, so it must be a real length.
        if not math.isfinite(duration) or duration < 0:
            print(f"ffprobeエラー: 不正な音声長です: {raw_duration!r}")
            return None

        return duration

    except subprocess.TimeoutExpired:
        print("ffprobeがタイムアウトしました")
        return None
    except Exception as e:
        # Deliberate catch-all: this helper must report failure via None
        # rather than propagate (e.g. OSError while spawning the process).
        print(f"ffprobeでの音声長取得エラー: {e}")
        return None
|
| 169 |
|
| 170 |
# --- 文字起こしコア関数 ---
|
| 171 |
def transcribe_audio_cli(
|
|
|
|
| 431 |
overlap_sec: int = CHUNK_OVERLAP_SECONDS
|
| 432 |
) -> List[str]:
|
| 433 |
print(f" 音声分割中: 基本チャンク長 {chunk_length_sec}s, オーバーラップ {overlap_sec}s")
|
| 434 |
+
|
| 435 |
+
# ファイルサイズをチェックして処理方法を決定
|
| 436 |
+
file_size = Path(audio_path_str).stat().st_size
|
| 437 |
+
file_size_gb = file_size / (1024**3)
|
| 438 |
+
|
| 439 |
+
# 4GB以上の場合はffmpegを使用
|
| 440 |
+
if file_size > 4 * 1024**3:
|
| 441 |
+
print(f" 大容量ファイル({file_size_gb:.2f}GB)のため、ffmpegで分割処理を実行します。")
|
| 442 |
+
return split_audio_with_ffmpeg(audio_path_str, output_dir_for_chunks, chunk_length_sec, overlap_sec)
|
| 443 |
+
|
| 444 |
+
# 4GB未満の場合は従来のpydub処理
|
| 445 |
+
try:
|
| 446 |
+
audio = AudioSegment.from_file(audio_path_str)
|
| 447 |
except Exception as e:
|
| 448 |
+
if "4GB" in str(e) or "Unable to process" in str(e):
|
| 449 |
+
print(f" pydubで4GB制限エラー。ffmpegで処理します: {e}")
|
| 450 |
+
return split_audio_with_ffmpeg(audio_path_str, output_dir_for_chunks, chunk_length_sec, overlap_sec)
|
| 451 |
+
else:
|
| 452 |
+
print(f" エラー: 音声ファイル '{Path(audio_path_str).name}' のロード中にエラー(分割処理): {e}")
|
| 453 |
+
return []
|
| 454 |
+
|
| 455 |
+
# 以下は既存のpydub処理...
|
| 456 |
duration_ms = len(audio); chunk_length_ms = chunk_length_sec * 1000; overlap_ms = overlap_sec * 1000
|
| 457 |
chunk_paths_list: List[str] = []; start_ms = 0; chunk_idx = 0
|
| 458 |
audio_file_stem = Path(audio_path_str).stem
|
|
|
|
| 476 |
print(f" 音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
|
| 477 |
return chunk_paths_list
|
| 478 |
|
| 479 |
+
def _discard_partial_chunk(chunk_file_path: Path) -> None:
    """Best-effort removal of a chunk file left behind by a failed ffmpeg run."""
    try:
        os.remove(chunk_file_path)
    except FileNotFoundError:
        # ffmpeg never created the file; nothing to clean up.
        pass
    except OSError as cleanup_e:
        print(f" 警告: 不完全なチャンクファイルを削除できませんでした: {cleanup_e}")


def split_audio_with_ffmpeg(
    audio_path_str: str,
    output_dir_for_chunks: str,
    chunk_length_sec: int,
    overlap_sec: int
) -> List[str]:
    """Split a large audio file into overlapping WAV chunks using ffmpeg.

    Each chunk is extracted by a separate ffmpeg invocation and written as
    16-bit PCM, mono, resampled to ``TARGET_SAMPLE_RATE``. Every chunk except
    the first starts ``overlap_sec`` earlier than its nominal start, and every
    chunk that ends before the end of the audio is extended by ``overlap_sec``.

    Args:
        audio_path_str: Path of the source audio file.
        output_dir_for_chunks: Directory the chunk files are written to.
        chunk_length_sec: Nominal chunk length in seconds; must be positive.
        overlap_sec: Seconds of audio shared between neighbouring chunks.

    Returns:
        POSIX paths of the chunks that were written successfully, in order.
        A chunk whose ffmpeg run fails or exceeds the 300 second timeout is
        reported, its partial output file is removed, and it is skipped.
        An empty list is returned when the chunk length is not positive,
        ffmpeg is missing, the duration cannot be determined, or an
        unexpected error occurs.
    """
    # A non-positive chunk length would never advance start_sec, so the loop
    # below would spin forever re-running ffmpeg. Reject it up front.
    if chunk_length_sec <= 0:
        print(f" エラー: チャンク長が不正です: {chunk_length_sec}")
        return []

    try:
        if not shutil.which('ffmpeg'):
            print("エラー: ffmpegが見つかりません。4GB以上のファイルを処理するにはffmpegが必要です。")
            return []

        # Get the total audio length; it bounds the chunking loop.
        duration_sec = get_audio_duration_with_ffprobe(audio_path_str)
        if duration_sec is None:
            print("エラー: ffmpegでの分割処理で音声長を取得できませんでした")
            return []

        chunk_paths_list: List[str] = []
        audio_file_stem = Path(audio_path_str).stem
        start_sec = 0
        chunk_idx = 0

        while start_sec < duration_sec:
            # Compute the chunk start/end times including the overlap margins.
            actual_start_sec = max(0, start_sec - (overlap_sec if start_sec > 0 else 0))
            base_end_sec = start_sec + chunk_length_sec
            actual_end_sec = min(base_end_sec + (overlap_sec if base_end_sec < duration_sec else 0), duration_sec)

            if actual_start_sec >= actual_end_sec:
                break

            chunk_duration = actual_end_sec - actual_start_sec
            chunk_file_name = f"{audio_file_stem}_chunk_{chunk_idx:03d}_temp.wav"
            chunk_file_path = Path(output_dir_for_chunks) / chunk_file_name

            # Extract and convert this slice of the audio with ffmpeg.
            cmd = [
                'ffmpeg', '-y', '-loglevel', 'error',
                '-ss', str(actual_start_sec),
                '-i', audio_path_str,
                '-t', str(chunk_duration),
                '-acodec', 'pcm_s16le',
                '-ar', str(TARGET_SAMPLE_RATE),
                '-ac', '1',  # mono
                str(chunk_file_path)
            ]

            try:
                result = subprocess.run(cmd, capture_output=True, text=True, timeout=300)
                if result.returncode == 0:
                    chunk_paths_list.append(chunk_file_path.as_posix())
                    print(f" チャンク {chunk_idx+1}: {actual_start_sec:.1f}s - {actual_end_sec:.1f}s -> {chunk_file_name}")
                else:
                    print(f" エラー: チャンク {chunk_idx} の生成に失敗: {result.stderr}")
                    # The file is not in the returned list, so the caller
                    # cannot clean it up; remove it here.
                    _discard_partial_chunk(chunk_file_path)
            except subprocess.TimeoutExpired:
                print(f" エラー: チャンク {chunk_idx} の生成がタイムアウトしました")
                _discard_partial_chunk(chunk_file_path)

            start_sec += chunk_length_sec
            chunk_idx += 1

        print(f" ffmpegで音声を {len(chunk_paths_list)} 個のチャンクに分割しました。")
        return chunk_paths_list

    except Exception as e:
        # Deliberate catch-all: callers expect an empty list on any failure.
        print(f" エラー: ffmpegでの音声分割中にエラー: {e}")
        return []
|
| 546 |
+
|
| 547 |
# --- 単一ファイル処理のメインロジック ---
|
| 548 |
def process_single_file(
|
| 549 |
input_file_path_obj: Path,
|