|
|
import json |
|
|
import torchaudio |
|
|
from tqdm import tqdm |
|
|
import os |
|
|
import sys |
|
|
from collections import defaultdict |
|
|
|
|
|
# Sample rates (Hz) accepted as normal; any other rate is reported as abnormal.
_EXPECTED_SAMPLE_RATES = frozenset({8000, 16000, 22050, 44100, 48000})


def _check_audio_file(audio_path):
    """Classify a single audio file referenced by a JSONL record.

    Args:
        audio_path: Path of the audio file as it appears in the record.

    Returns:
        A ``(stat_key, problem)`` tuple. ``stat_key`` names the counter to
        increment (``'missing'``, ``'zero_size'``, ``'empty_audio'``,
        ``'abnormal_sr'``, ``'corrupted'`` or ``'valid'``). ``problem`` is the
        message to log, or ``None`` when the file is valid.
    """
    # A non-path entry (None, number, object) cannot name a file; report it
    # as missing instead of letting os.path raise and abort the whole run.
    if not isinstance(audio_path, (str, os.PathLike)):
        return 'missing', f"缺失文件: {audio_path}"

    if not os.path.exists(audio_path):
        return 'missing', f"缺失文件: {audio_path}"

    if os.path.getsize(audio_path) == 0:
        return 'zero_size', f"空文件: {audio_path}"

    try:
        waveform, sr = torchaudio.load(audio_path)
        if waveform.numel() == 0:
            return 'empty_audio', f"空音频: {audio_path}"
        if sr not in _EXPECTED_SAMPLE_RATES:
            return 'abnormal_sr', f"异常采样率({sr}Hz): {audio_path}"
        return 'valid', None
    except Exception as e:
        # Deliberately broad: any failure while decoding or inspecting the
        # file marks it as corrupted rather than stopping the validation.
        error_type = str(e).split('(')[0]
        return 'corrupted', f"损坏文件({error_type}): {audio_path}"


def validate_jsonl_audios(jsonl_path):
    """Validate the integrity of every audio file referenced in a JSONL file.

    Each line is parsed as a JSON object whose ``audios`` field lists audio
    file paths. Every path is checked for existence, non-zero size,
    loadability through ``torchaudio.load``, non-empty content and an expected
    sample rate. A summary is printed to stdout and, when problems are found,
    they are written to ``<jsonl stem>_audio_errors.log`` next to the input.

    Args:
        jsonl_path: Path of the JSONL file to validate.

    Returns:
        None. Results are reported through stdout and the error log file.
    """
    stats = defaultdict(int)
    error_log = []
    valid_samples = 0

    # First pass only counts lines so the progress bar knows its total.
    # The encoding is explicit so the result does not depend on the locale.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    with open(jsonl_path, 'r', encoding='utf-8') as f:
        progress = tqdm(f, total=total_lines, desc="验证进度", unit="line")
        for line_no, line in enumerate(progress, start=1):
            try:
                data = json.loads(line.strip())
            except json.JSONDecodeError:
                data = None

            # Valid JSON that is not an object (number, string, list, null)
            # cannot carry an 'audios' field; treat it like malformed JSON
            # instead of crashing on the field lookup.
            if not isinstance(data, dict):
                stats['invalid_json'] += 1
                error_log.append(f"[行{line_no}] 无效JSON格式")
                continue

            audios = data.get('audios')
            # A bare string is a single path, not a sequence of characters.
            if isinstance(audios, str) and audios:
                audios = [audios]
            # Missing, empty, or not a list: there is nothing to check.
            if not audios or not isinstance(audios, list):
                stats['no_audio_field'] += 1
                continue

            sample_ok = True
            for audio_path in audios:
                stat_key, problem = _check_audio_file(audio_path)
                stats[stat_key] += 1
                if problem is not None:
                    sample_ok = False
                    error_log.append(f"[行{line_no}] {problem}")

            # A sample is valid only when every one of its audio files passed.
            if sample_ok:
                valid_samples += 1

    print("\n===== 验证报告 =====")
    print(f"总行数: {total_lines}")
    print(f"有效样本: {valid_samples}")
    print("--- 问题统计 ---")
    for k, v in sorted(stats.items()):
        print(f"{k}: {v}")

    if error_log:
        log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
        # The messages contain non-ASCII text, so the encoding must be explicit.
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(error_log))
        print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: exactly one argument, the JSONL file path.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print("使用方法: python validate_audio_jsonl.py <input.jsonl>")
        sys.exit(1)

    # Stop early with a clear message when the input file is not there.
    input_path = cli_args[0]
    if not os.path.exists(input_path):
        print(f"错误: 文件 {input_path} 不存在")
        sys.exit(1)

    validate_jsonl_audios(input_path)