interactSpeech / .ipynb_checkpoints /checkMissing-checkpoint.py
Student0809's picture
Add files using upload-large-folder tool
e791fa3 verified
raw
history blame
3.31 kB
import json
import torchaudio
from tqdm import tqdm
import os
import sys
from collections import defaultdict
# Sample rates accepted as normal; any other rate is reported as 'abnormal_sr'.
_EXPECTED_SAMPLE_RATES = frozenset({8000, 16000, 22050, 44100, 48000})


def _check_audio_file(audio_path):
    """Classify a single audio path.

    Returns a ``(stat_key, problem)`` pair. ``stat_key`` names the counter to
    increment ('invalid_path', 'missing', 'zero_size', 'empty_audio',
    'abnormal_sr', 'corrupted' or 'valid'). ``problem`` is the log text
    describing the issue, or ``None`` when the file passed every check.
    """
    # A non-string entry cannot be a usable path; report it instead of letting
    # os.path raise TypeError and abort the whole run.
    if not isinstance(audio_path, str):
        return 'invalid_path', f"无效路径: {audio_path!r}"

    if not os.path.exists(audio_path):
        return 'missing', f"缺失文件: {audio_path}"

    try:
        if os.path.getsize(audio_path) == 0:
            return 'zero_size', f"空文件: {audio_path}"
        waveform, sample_rate = torchaudio.load(audio_path)
    except Exception as exc:
        # Deliberately broad: any failure to stat or decode the file marks it
        # as corrupted rather than stopping the validation.
        # Keep the text before the first '(' as the summary, collapse it onto
        # one line so the log stays one entry per line, and fall back to the
        # exception class name when the message is empty.
        summary = " ".join(str(exc).split('(')[0].split()) or type(exc).__name__
        return 'corrupted', f"损坏文件({summary}): {audio_path}"

    if waveform.numel() == 0:
        return 'empty_audio', f"空音频: {audio_path}"
    if sample_rate not in _EXPECTED_SAMPLE_RATES:
        return 'abnormal_sr', f"异常采样率({sample_rate}Hz): {audio_path}"
    return 'valid', None


def validate_jsonl_audios(jsonl_path):
    """Validate every audio file referenced by a JSONL file.

    Each line is parsed as JSON and the paths under its 'audios' key are
    checked for existence, non-zero size, decodability, non-empty content and
    an expected sample rate. A summary is printed to stdout; if any problem
    was found, the messages are written to ``<jsonl stem>_audio_errors.log``
    next to the input file.

    Args:
        jsonl_path: Path to the JSONL file (read as UTF-8).

    Returns:
        None. Results are reported through stdout and the optional log file.
    """
    stats = defaultdict(int)
    error_log = []
    valid_samples = 0

    # First pass: count lines so the progress bar has a total.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        total_lines = sum(1 for _ in f)

    # Second pass: the actual validation.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        progress = tqdm(f, total=total_lines, desc="验证进度", unit="line")
        for line_no, raw_line in enumerate(progress, start=1):
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError:
                stats['invalid_json'] += 1
                error_log.append(f"[行{line_no}] 无效JSON格式")
                continue

            # Valid JSON that is not an object (null, number, list, ...) has
            # no 'audios' key to inspect; report it instead of crashing.
            if not isinstance(record, dict):
                stats['invalid_record'] += 1
                error_log.append(f"[行{line_no}] 无效记录(非JSON对象)")
                continue

            audio_paths = record.get('audios')
            if not audio_paths:
                stats['no_audio_field'] += 1
                continue
            if isinstance(audio_paths, str):
                # A bare string would otherwise be iterated character by character.
                audio_paths = [audio_paths]
            elif not isinstance(audio_paths, (list, tuple)):
                stats['invalid_record'] += 1
                error_log.append(f"[行{line_no}] 无效记录(audios 不是列表)")
                continue

            # A sample counts as valid only when every one of its audio files
            # passes all checks.
            sample_ok = True
            for audio_path in audio_paths:
                stat_key, problem = _check_audio_file(audio_path)
                stats[stat_key] += 1
                if problem is not None:
                    sample_ok = False
                    error_log.append(f"[行{line_no}] {problem}")
            if sample_ok:
                valid_samples += 1

    # Print the summary report.
    print("\n===== 验证报告 =====")
    print(f"总行数: {total_lines}")
    print(f"有效样本: {valid_samples}")
    print("--- 问题统计 ---")
    for key, count in sorted(stats.items()):
        print(f"{key}: {count}")

    # Persist the error log. UTF-8 is explicit because the messages contain
    # non-ASCII text that a locale-default encoding may be unable to encode.
    if error_log:
        log_file = f"{os.path.splitext(jsonl_path)[0]}_audio_errors.log"
        with open(log_file, 'w', encoding='utf-8') as f:
            f.write("\n".join(error_log))
        print(f"\n发现 {len(error_log)} 个问题,已保存到 {log_file}")
if __name__ == "__main__":
    # Exactly one positional argument is expected: the JSONL file to validate.
    if len(sys.argv) != 2:
        # Derive the script name from argv so the usage text matches however
        # this file is actually named and invoked.
        print(f"使用方法: python {os.path.basename(sys.argv[0])} <input.jsonl>")
        sys.exit(1)
    input_path = sys.argv[1]
    # isfile (not exists) so a directory is rejected here instead of failing
    # later inside open().
    if not os.path.isfile(input_path):
        print(f"错误: 文件 {input_path} 不存在或不是文件")
        sys.exit(1)
    validate_jsonl_audios(input_path)