import os import jsonlines from concurrent.futures import ThreadPoolExecutor from common import extract_transform def run(): # remove previous executions if os.path.exists("/opt/physionet/dataset.jsonl"): os.remove("/opt/physionet/dataset.jsonl") if os.path.exists("/opt/physionet/control.jsonl"): os.remove("/opt/physionet/control.jsonl") # create a control dictionary root = "/opt/physionet/physionet.org/files/mimic-cxr/2.0.0/files" with jsonlines.open("/opt/physionet/control.jsonl","w") as writer: parts = os.listdir(root) for part in parts: patients = os.listdir(os.path.join(root,part)) for patient in patients: scan = [x for x in os.listdir(os.path.join(root,part,patient)) if x.endswith('.txt')] writer.write({"part": part, "patient": patient,"scan": scan}) # parse each record with ThreadPoolExecutor(max_workers=4) as executor: with jsonlines.open("/opt/physionet/control.jsonl","r") as reader: executor.map(extract_transform, reader) # only run it if there are files downloaded if __name__ == "__main__": try: if len(os.listdir('/opt/physionet/physionet.org/files/mimic-cxr/2.0.0/files')) > 0: run() except OSError: print("not downloaded yet")