interactSpeech / .ipynb_checkpoints /intersect_jsonl-checkpoint.py
Student0809's picture
Add files using upload-large-folder tool
e791fa3 verified
raw
history blame
1.25 kB
import json
import sys
import os
from tqdm import tqdm
def load_jsonl_set(path):
    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

    Every non-blank line is parsed as JSON. The record's ``audios`` value
    (an empty list when the field is absent) is converted to a tuple so it
    can serve as a dict key. When several records produce the same key, the
    last one in the file wins.

    Loading is best-effort: a line is skipped when it is not valid JSON, when
    the parsed value is not a JSON object, or when its ``audios`` value cannot
    be turned into a hashable tuple. The number of skipped lines is reported
    on stderr so dropped records do not go unnoticed.

    Args:
        path: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A dict mapping ``tuple(audios)`` to the parsed record.
    """
    data_dict = {}
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no record; ignore them without counting.
                continue
            try:
                item = json.loads(line)
                # tuple(audios) is hashable, unlike the list itself.
                # NOTE: records without an 'audios' field all share the key ().
                key = tuple(item.get('audios', []))
                data_dict[key] = item
            except (ValueError, AttributeError, TypeError):
                # ValueError: malformed JSON (json.JSONDecodeError subclass).
                # AttributeError: parsed value is not an object (no .get).
                # TypeError: 'audios' is not iterable or holds unhashable items.
                skipped += 1
    if skipped:
        print(f"warning: skipped {skipped} unusable line(s) in {path}",
              file=sys.stderr)
    return data_dict
def main(file1, file2, output_path):
    """Write the records of ``file1`` whose ``audios`` key also occurs in ``file2``.

    Both inputs are loaded with ``load_jsonl_set``. For every key present in
    both, the record taken from ``file1`` is written to ``output_path`` as one
    JSON object per line. Records are emitted in the order their keys first
    appear in ``file1``, so repeated runs produce identical output files.

    Args:
        file1: Path of the JSONL file whose records are written out.
        file2: Path of the JSONL file used only to filter keys.
        output_path: Path of the JSONL file to create or overwrite.
    """
    dict1 = load_jsonl_set(file1)
    dict2 = load_jsonl_set(file2)
    # Walk dict1 in insertion order instead of iterating a set intersection:
    # set iteration order over string tuples changes between interpreter runs,
    # which made the output file order non-reproducible.
    common_keys = [key for key in dict1 if key in dict2]
    print(f"交集样本数: {len(common_keys)}")
    with open(output_path, 'w', encoding='utf-8') as out:
        for key in tqdm(common_keys, desc="写入交集"):
            # The content from file1 is authoritative for shared keys.
            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')
    print(f"交集已保存到: {output_path}")
if __name__ == "__main__":
    # Expect exactly three positional arguments after the script name:
    # the two input JSONL files and the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) == 3:
        main(*cli_args)
    else:
        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
        sys.exit(1)