"""Write the records shared by two JSONL files, matched on their ``audios`` field.

Usage: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl
"""
import json
import os
import sys

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic, so the script still runs without tqdm.
    def tqdm(iterable, **_kwargs):
        """Stand-in for ``tqdm.tqdm``: return *iterable* unchanged."""
        return iterable


def load_jsonl_set(path: str) -> dict:
    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

    Args:
        path: Path of a UTF-8 encoded file holding one JSON object per line.

    Returns:
        A dict mapping ``tuple(record["audios"])`` to the parsed record.
        Records without an ``audios`` field all share the empty-tuple key.
        When a key occurs more than once, the last record wins.

    Lines that cannot be used are skipped rather than aborting the load:
    invalid JSON, JSON that is not an object, or an ``audios`` value that
    cannot be turned into a hashable tuple. The number of skipped lines is
    reported on stderr so that data loss is visible.
    """
    data_dict = {}
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines are not records, so they are not counted as skipped.
                continue
            try:
                item = json.loads(line)
                # A tuple of the audio entries is hashable, unlike the list itself.
                key = tuple(item.get('audios', []))
                data_dict[key] = item
            except (ValueError, AttributeError, TypeError):
                # ValueError: malformed JSON (JSONDecodeError is a subclass).
                # AttributeError: the line parsed to a non-object (no ``.get``).
                # TypeError: ``audios`` is not iterable or holds unhashable items.
                skipped += 1
    if skipped:
        print(f"警告: {path} 中有 {skipped} 行无法解析, 已跳过", file=sys.stderr)
    return data_dict


def main(file1: str, file2: str, output_path: str) -> None:
    """Write the records of *file1* whose ``audios`` key also occurs in *file2*.

    Args:
        file1: JSONL file whose records are copied to the output.
        file2: JSONL file used only to decide which keys are shared.
        output_path: Destination JSONL file; overwritten if it exists.

    Both inputs are loaded completely before the output file is opened.
    Records are written in the order their keys first appear in *file1*.
    """
    dict1 = load_jsonl_set(file1)
    dict2 = load_jsonl_set(file2)

    # Walk file1's keys in insertion order instead of iterating a set, so the
    # output order is the same on every run.
    common_keys = [key for key in dict1 if key in dict2]
    print(f"交集样本数: {len(common_keys)}")

    with open(output_path, 'w', encoding='utf-8') as out:
        for key in tqdm(common_keys, desc="写入交集"):
            # file1's version of the record is the one that is kept.
            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')
    print(f"交集已保存到: {output_path}")


if __name__ == "__main__":
    if len(sys.argv) != 4:
        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
        sys.exit(1)
    main(sys.argv[1], sys.argv[2], sys.argv[3])