|
|
import json |
|
|
import sys |
|
|
import os |
|
|
from tqdm import tqdm |
|
|
|
|
|
def load_jsonl_set(path):
    """Load a JSONL file into a dict keyed by each record's ``audios`` field.

    Args:
        path: Path of a UTF-8 encoded JSON Lines file (one JSON value per line).

    Returns:
        A dict mapping ``tuple(record.get('audios', []))`` to the parsed
        record. When several records share the same key, the last one read
        replaces the earlier ones. Records that have no ``audios`` field all
        share the empty-tuple key ``()``.

    Loading is best-effort: blank lines are ignored, and lines that are not
    valid JSON, are not JSON objects, or whose ``audios`` value cannot be
    turned into a hashable tuple are skipped. The number of skipped lines is
    reported on stderr so data problems do not go unnoticed.
    """
    data_dict = {}
    skipped = 0
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) are not malformed records.
                continue
            try:
                item = json.loads(line)
                # A tuple is used because a list cannot serve as a dict key.
                key = tuple(item.get('audios', []))
                data_dict[key] = item
            except (ValueError, AttributeError, TypeError):
                # ValueError: invalid JSON (json.JSONDecodeError is a subclass).
                # AttributeError: the JSON value is not an object (no .get).
                # TypeError: 'audios' is not iterable or holds unhashable items.
                skipped += 1
    if skipped:
        print(f"警告: {path} 中有 {skipped} 行无法解析, 已跳过", file=sys.stderr)
    return data_dict
|
|
|
|
|
def main(file1, file2, output_path):
    """Write the records of ``file1`` whose ``audios`` key also occurs in ``file2``.

    Both inputs are loaded with ``load_jsonl_set``, which keys each record by
    the tuple of its ``audios`` field. For every key present in both files,
    the record taken from ``file1`` is written to ``output_path`` as one JSON
    object per line (UTF-8, non-ASCII characters kept as-is).

    Args:
        file1: Path of the JSONL file whose records are written out.
        file2: Path of the JSONL file used only to test key membership.
        output_path: Path of the JSONL file to create or overwrite.
    """
    dict1 = load_jsonl_set(file1)
    dict2 = load_jsonl_set(file2)

    # Filter dict1 in insertion order instead of iterating a set intersection:
    # set iteration order is arbitrary (and varies between runs under string
    # hash randomization), whereas this keeps the output in file1's order and
    # makes it reproducible.
    common_keys = [key for key in dict1 if key in dict2]
    print(f"交集样本数: {len(common_keys)}")

    with open(output_path, 'w', encoding='utf-8') as out:
        for key in tqdm(common_keys, desc="写入交集"):
            out.write(json.dumps(dict1[key], ensure_ascii=False) + '\n')

    print(f"交集已保存到: {output_path}")
|
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: expects exactly three positional arguments,
    # the two input JSONL paths followed by the output path.
    cli_args = sys.argv[1:]
    if len(cli_args) == 3:
        main(*cli_args)
    else:
        # Wrong argument count: show the usage line and exit with status 1.
        print("用法: python intersect_jsonl.py file1.jsonl file2.jsonl output.jsonl")
        sys.exit(1)
|
|
|