interactSpeech / .ipynb_checkpoints /intersect_jsonl-checkpoint.py

Add files using upload-large-folder tool

e791fa3 verified 4 months ago

1.25 kB

	import json
	import sys
	import os
	from tqdm import tqdm

	def load_jsonl_set(path):
	    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

	    Every line is parsed as JSON; the record's ``audios`` value (an empty
	    list when the field is absent) is converted to a tuple and used as the
	    dictionary key, with the parsed record as the value.

	    Lines that cannot be turned into a keyed record are skipped rather than
	    aborting the load: blank lines, malformed JSON, JSON values that are not
	    objects, and records whose ``audios`` value is not iterable or contains
	    unhashable items.

	    Note that records sharing the same ``audios`` tuple overwrite one
	    another (the last one in the file wins), and all records without an
	    ``audios`` field share the empty-tuple key ``()``.

	    Args:
	        path: Path of the UTF-8 encoded JSONL file to read.

	    Returns:
	        dict mapping ``tuple(audios)`` to the parsed record.
	    """
	    data_dict = {}
	    with open(path, 'r', encoding='utf-8') as f:
	        for line in f:
	            line = line.strip()
	            if not line:
	                # Blank lines carry no record; skip them without parsing.
	                continue
	            try:
	                item = json.loads(line)
	                # A tuple is hashable, so the audio list can act as the key.
	                key = tuple(item.get('audios', []))
	                data_dict[key] = item
	            except (ValueError, AttributeError, TypeError):
	                # ValueError: malformed JSON (JSONDecodeError subclasses it).
	                # AttributeError: valid JSON, but not an object with .get().
	                # TypeError: 'audios' is not iterable or holds unhashable items.
	                continue
	    return data_dict

	def main(file1, file2, output_path):
	    """Write the records of ``file1`` whose ``audios`` key also occurs in ``file2``.

	    Both inputs are loaded with ``load_jsonl_set``; a record is considered
	    common when its ``audios`` tuple is a key in both resulting dicts. The
	    record content written out is always the one from ``file1``. Records are
	    written in the order their keys were first seen in ``file1`` so repeated
	    runs produce identical output files.

	    Args:
	        file1: Path of the first JSONL file; supplies the output records.
	        file2: Path of the second JSONL file; only its keys are used.
	        output_path: Path of the JSONL file to create or overwrite.
	    """
	    dict1 = load_jsonl_set(file1)
	    dict2 = load_jsonl_set(file2)

	    # Walk file1's keys in insertion order instead of iterating a set:
	    # set order over string tuples depends on hash randomization and would
	    # make the output line order differ between runs.
	    common_keys = [key for key in dict1 if key in dict2]
	    print(f"交集样本数: {len(common_keys)}")

	    with open(output_path, 'w', encoding='utf-8') as out:
	        for key in tqdm(common_keys, desc="写入交集"):
	            # file1's version of the record is the one emitted.
	            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')

	    print(f"交集已保存到: {output_path}")

	if __name__ == "__main__":
	    # Script entry point: exactly three positional arguments are required
	    # (two input JSONL paths and one output path).
	    cli_args = sys.argv[1:]
	    if len(cli_args) != 3:
	        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
	        sys.exit(1)
	    first_path, second_path, result_path = cli_args
	    main(first_path, second_path, result_path)