ns committed
Commit 0e231b3 · 1 Parent(s): 4827b95

containers

containers/etl/Dockerfile ADDED
@@ -0,0 +1,37 @@
+ FROM python:3.9-buster
+
+ RUN \
+ apt-get update && \
+ apt-get -y upgrade && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ RUN useradd --create-home app
+ WORKDIR /home/app
+
+ COPY requirements.txt /home/app/
+ COPY __init__.py /home/app/
+ COPY common.py /home/app/
+ COPY run.py /home/app/
+
+ RUN \
+ chown app:app /home/app/requirements.txt && \
+ chmod 0755 /home/app/requirements.txt && \
+ chown app:app /home/app/__init__.py && \
+ chmod 0755 /home/app/__init__.py && \
+ chown app:app /home/app/run.py && \
+ chmod 0755 /home/app/run.py && \
+ chown app:app /home/app/common.py && \
+ chmod 0755 /home/app/common.py
+
+ USER app
+
+ ENV VIRTUAL_ENV=/home/app/venv
+ RUN python3 -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ RUN \
+ pip install --upgrade pip && \
+ pip install -r requirements.txt
+
+ CMD ["python", "run.py", "worker", "-l", "info"]
containers/etl/__init__.py ADDED
File without changes
containers/etl/common.py ADDED
@@ -0,0 +1,119 @@
+ import re
+ import pandas as pd
+ import os
+ import random
+ import jsonlines
+
+ def flatten_json(data: dict) -> dict:
+     """recursively flatten json elements, from https://www.geeksforgeeks.org/flattening-json-objects-in-python/"""
+     out = {}
+
+     def flatten(x, name=""):
+         # If the nested key-value
+         # pair is of dict type
+         if type(x) is dict:
+             for a in x:
+                 flatten(x[a], name + a + "_")
+
+         # If the nested key-value
+         # pair is of list type
+         elif type(x) is list:
+             i = 0
+
+             for a in x:
+                 flatten(a, name + str(i) + "_")
+                 i += 1
+         else:
+             out[name[:-1]] = x
+
+     flatten(data)
+     return out
+
+
+ def construct_report(string: str) -> tuple:
+
+     # collect the section headers (e.g. "FINDINGS:", "IMPRESSION:") and normalize them
+     keywords = [x.replace(":", "").lower() for x in re.findall("[A-Z0-9][A-Z0-9. ]*:", string)]
+
+     # split the lowercased report into header/paragraph pairs
+     paragraphs = re.findall("(\w+)*: *(.*?)(?=\s*(?:\w+:|$))", string.lower())
+     sections = []
+     for header, paragraph in paragraphs:
+         if header in [x.replace(" ", "_").replace("/", "_") for x in keywords]:
+             sections.append(":".join([header, ". ".join([x.strip() for x in paragraph.split(". ") if x])]))
+         else:
+             sections.append(" - ".join([header, ". ".join([x.strip() for x in paragraph.split(". ") if x])]))
+     sections = list(map(lambda a: a + "." if a[-1] != "." else a, sections))
+     paragraphs = re.findall("(\w+) *: *(.*?)(?=\s*(?:\w+:|$))", " ".join(sections))
+
+     report = {}
+     for header, paragraph in paragraphs:
+         sentence = paragraph.replace(" ", ". ").replace("..", ".").replace(" - .", " - ")
+         sentence = re.split(r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s", sentence)
+         sentence = [x.strip() for x in sentence if len(x) > 2]
+         report[header.replace("_", " ")] = [x.replace("_", " ") for x in sentence]
+     report = flatten_json(report)
+     topic = [x.split("_")[0] for x in report.keys()]
+     body = [x for x in report.values()]
+     report = pd.DataFrame(list(zip(topic, body)))
+     try:
+         report.columns = ["paragraph", "sentence"]
+         report["ranking"] = report.index
+         report["screen"] = report["sentence"].apply(lambda x: 1 if 'interval change' in x or 'compar' in x or 'prior' in x or 'improved from' in x else 0)
+         reason = re.sub(" +", " ", " ".join([": ".join([key, value]) for (key, value) in collapse_report(report).items() if key in ['indication', 'history']]))
+         text = re.sub(" +", " ", " ".join([": ".join([key, value]) for (key, value) in collapse_report(report[report.screen == 0]).items() if key in ['findings', 'impression']]))
+         if 'findings' in text and 'impression' in text:
+             return reason, text
+         else:
+             return None, None
+     except ValueError:
+         return None, None
+
+ # take a report dataframe and return a dictionary of the paragraphs
+ def collapse_report(report: pd.DataFrame) -> dict:
+     """take a report dataframe and return the paragraphs in each section as key:value pairs"""
+     out = pd.merge(
+         report['paragraph'].drop_duplicates(),
+         report.groupby(['paragraph'])['sentence'].transform(lambda x: ' '.join(x)).drop_duplicates(),
+         left_index=True,
+         right_index=True
+     )
+     structure = dict()
+     for index, row in out.iterrows():
+         structure[row['paragraph']] = row['sentence']
+     return structure
+
+
+ def extract_transform(row: dict) -> None:
+
+     report_root = "./physionet.org/files/mimic-cxr/2.0.0/files"
+     image_root = "./physionet.org/files/mimic-cxr-jpg/2.0.0/files"
+
+     try:
+         scans = os.listdir(os.path.join(image_root, row["part"], row["patient"]))
+         scans = [x for x in scans if 'txt' not in x]
+         for scan in scans:
+             report = os.path.join(report_root, row["part"], row["patient"], scan + ".txt")
+             if os.path.exists(report):
+                 with open(report, "r") as f:
+                     original = f.read()
+                 transformed = re.sub(" +", " ", original.replace("FINAL REPORT", "").strip().replace("\n \n", ".").replace("\n", " ")).replace(" . ", " ").replace("..", ".").replace("CHEST RADIOGRAPHS.", " ").strip()
+                 if len(transformed) > 0:
+                     reason, text = construct_report(transformed)
+                     images = [os.path.join(image_root, row["part"], row["patient"], scan, x) for x in os.listdir(os.path.join(image_root, row["part"], row["patient"], scan))]
+                     images = [x for x in images if os.path.exists(x)]
+                     random.shuffle(images)  # shuffle so we can reasonably sample 1 image per study
+                     with jsonlines.open("dataset.jsonl", "a") as writer:
+                         for image in images:
+                             writer.write({
+                                 "fold": row["patient"][0:3],
+                                 "image": image,
+                                 "study": image.split("/")[-2],
+                                 "original": transformed,
+                                 "report": report,
+                                 "patient": row["patient"],
+                                 "reason": reason,
+                                 "text": " ".join([reason, text]) if reason is not None and text is not None else None
+                             })
+     except FileNotFoundError:
+         pass
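
A minimal usage sketch for flatten_json (illustrative only, not part of the commit; the nested dict below is made up):

    from common import flatten_json

    # nested sections flatten into underscore-joined keys; list items get positional suffixes
    nested = {"impression": ["no effusion", "no pneumothorax"], "meta": {"view": "AP"}}
    print(flatten_json(nested))
    # {'impression_0': 'no effusion', 'impression_1': 'no pneumothorax', 'meta_view': 'AP'}
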
containers/etl/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ jsonlines>=3.1.0,<3.2
+ pandas>=1.5.3,<1.6
containers/etl/run.py ADDED
@@ -0,0 +1,38 @@
+ import os
+ import jsonlines
+ from concurrent.futures import ThreadPoolExecutor
+ from common import extract_transform
+
+ def run():
+
+     # remove previous executions
+     if os.path.exists("/opt/physionet/dataset.jsonl"):
+         os.remove("/opt/physionet/dataset.jsonl")
+
+     if os.path.exists("/opt/physionet/control.jsonl"):
+         os.remove("/opt/physionet/control.jsonl")
+
+     # create a control dictionary
+     root = "/opt/physionet/physionet.org/files/mimic-cxr/2.0.0/files"
+     with jsonlines.open("/opt/physionet/control.jsonl", "w") as writer:
+         parts = os.listdir(root)
+         for part in parts:
+             patients = os.listdir(os.path.join(root, part))
+             for patient in patients:
+                 scan = [x for x in os.listdir(os.path.join(root, part, patient)) if x.endswith('.txt')]
+                 writer.write({"part": part, "patient": patient, "scan": scan})
+
+
+     # parse each record
+     with ThreadPoolExecutor(max_workers=4) as executor:
+         with jsonlines.open("/opt/physionet/control.jsonl", "r") as reader:
+             executor.map(extract_transform, reader)
+
+
+ # only run it if there are files downloaded
+ if __name__ == "__main__":
+     try:
+         if len(os.listdir('/opt/physionet/physionet.org/files/mimic-cxr/2.0.0/files')) > 0:
+             run()
+     except OSError:
+         print("not downloaded yet")
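
A sketch of how one control.jsonl record drives extract_transform (illustrative, not part of the commit; the part/patient IDs are hypothetical, and common.py resolves its ./physionet.org paths relative to the working directory, so /opt/physionet is assumed as CWD):

    import os
    from common import extract_transform

    os.chdir("/opt/physionet")  # common.py uses relative ./physionet.org/... roots
    row = {"part": "p10", "patient": "p10000032", "scan": ["s50414267.txt"]}
    extract_transform(row)      # appends one JSON line per image of that patient to dataset.jsonl
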
containers/jupyter/Dockerfile ADDED
@@ -0,0 +1,28 @@
+ FROM python:3.9-buster
+
+ RUN \
+ apt-get update && \
+ apt-get -y upgrade && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ RUN useradd --create-home app
+ WORKDIR /home/app
+
+ COPY requirements.txt /home/app/
+
+ RUN \
+ chown app:app /home/app/requirements.txt && \
+ chmod 0755 /home/app/requirements.txt
+
+ USER app
+
+ ENV VIRTUAL_ENV=/home/app/venv
+ RUN python3 -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ RUN \
+ pip install --upgrade pip && \
+ pip install -r requirements.txt
+
+ CMD ["jupyter", "notebook", "--notebook-dir=/opt/notebooks", "--ip='*'", "--port=8888", "--no-browser"]
containers/jupyter/requirements.txt ADDED
@@ -0,0 +1 @@
+ jupyter>=1.0.0,<1.1
containers/physionet/Dockerfile ADDED
@@ -0,0 +1,11 @@
+ FROM debian:buster
+
+ RUN apt-get update -y && \
+ apt-get -y install parallel wget && \
+ apt-get -y autoclean && \
+ apt-get -y autoremove && \
+ rm -rf /var/lib/apt/lists/*
+
+ COPY entrypoint.sh /opt/entrypoint.sh
+
+ ENTRYPOINT ["/bin/bash", "/opt/entrypoint.sh"]
containers/physionet/entrypoint.sh ADDED
@@ -0,0 +1,11 @@
+ #!/bin/bash
+ set -e
+
+ if [ $# -eq 0 ]
+ then
+     echo no download requested
+ else
+     cd /opt/physionet
+     wget -A .txt -r -nc -c -np --user $PHYSIONET_USER --password $PHYSIONET_PASSWORD https://physionet.org/files/mimic-cxr/2.0.0/files/
+     seq 10 19 | parallel -j4 wget -A .jpg -r -nc -c -np --user $PHYSIONET_USER --password $PHYSIONET_PASSWORD https://physionet.org/files/mimic-cxr-jpg/2.0.0/files/p{}/
+ fi
containers/physionet/run.sh ADDED
@@ -0,0 +1,6 @@
+
+ if ${1:-false}; then
+     # get the JPG
+     # spread out over 4 cores
+     echo True
+ fi
containers/prerad ADDED
@@ -0,0 +1 @@
+ Subproject commit 9c1e2f3995808260287d58217a0522592f78aed0
containers/streamlit/example.txt ADDED
@@ -0,0 +1,16 @@
+ FINAL REPORT
+ EXAMINATION: CHEST (PORTABLE AP)
+
+ INDICATION: ___ year old man with episodic ___ weakness // r/o infection
+
+ TECHNIQUE: CHEST (PORTABLE AP)
+
+ COMPARISON: None.
+
+ IMPRESSION:
+
+ Heart size and mediastinum are mildly enlarged. The patient is after median
+ sternotomy and CABG. Lung volumes are preserved. Mild interstitial changes
+ are noted bilaterally, potentially representing chronic changes but mild
+ interstitial edema is a possibility. No definitive focal consolidations to
+ suggest infectious process demonstrated. No pleural effusion or pneumothorax.
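
As a rough illustration of how the ETL sections a report like the one above (not part of the commit; the cleanup only approximates what extract_transform does, and paths are illustrative):

    import re
    from common import construct_report

    with open("containers/streamlit/example.txt") as f:
        original = f.read()

    # roughly the normalization extract_transform applies before sectioning
    transformed = re.sub(" +", " ", original.replace("FINAL REPORT", "").strip()
                         .replace("\n \n", ".").replace("\n", " "))
    reason, text = construct_report(transformed)
    # this report has an IMPRESSION but no FINDINGS section, so construct_report
    # returns (None, None) and the study would be skipped by the ETL
    print(reason, text)
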
containers/train/Dockerfile ADDED
@@ -0,0 +1,31 @@
+ FROM python:3.9-buster
+
+ RUN \
+ apt-get update && \
+ apt-get -y upgrade && \
+ apt-get clean && \
+ rm -rf /var/lib/apt/lists/*
+
+ RUN useradd --create-home app
+ WORKDIR /home/app
+
+ COPY requirements.txt /home/app/
+ COPY run.py /home/app/
+
+ RUN \
+ chown app:app /home/app/requirements.txt && \
+ chmod 0755 /home/app/requirements.txt && \
+ chown app:app /home/app/run.py && \
+ chmod 0755 /home/app/run.py
+
+ USER app
+
+ ENV VIRTUAL_ENV=/home/app/venv
+ RUN python3 -m venv $VIRTUAL_ENV
+ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
+
+ RUN \
+ pip install --upgrade pip && \
+ pip install -r requirements.txt
+
+ CMD ["python", "run.py", "worker", "-l", "info"]
containers/train/requirements.txt ADDED
@@ -0,0 +1,3 @@
+ transformers[torch]>=4.26.0,<4.27
+ pillow>=9.4.0,<9.5
+ datasets>=2.9.0,<2.10
containers/train/run.py ADDED
@@ -0,0 +1,83 @@
+ import pandas as pd
+ from datasets import Dataset, Image
+ import torch
+ from transformers import Trainer, TrainingArguments
+ from transformers import DataCollatorForLanguageModeling
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+ # initialize the model from the pretrained checkpoint
+ repo = "Salesforce/blip-image-captioning-large"
+ processor = BlipProcessor.from_pretrained(repo)
+ tokenizer = processor.tokenizer
+ model = BlipForConditionalGeneration.from_pretrained(repo)
+
+ # load the data configuration and split into train/valid
+ dt = pd.read_json("dataset.jsonl", lines=True).dropna()
+ dt["train"] = dt["fold"].apply(lambda x: 0 if x in ['p19'] else 1)  # hold out p19, ~10% of the data
+ dt["patient"] = dt["patient"].apply(lambda x: x[0:5])
+ train = dt[dt.train == 1]
+ valid = dt[dt.train == 0]
+
+ # create datasets
+ train_dataset = Dataset.from_dict({
+     "image": train["image"].to_list(),
+     "fold": train["fold"].to_list(),
+     "text": train["text"].to_list(),
+     "reason": train["reason"].to_list(),
+     "id": [x.split("/")[-1].replace(".jpg", "") for x in train["image"].to_list()]
+ }).cast_column("image", Image())
+
+ valid_dataset = Dataset.from_dict({
+     "image": valid["image"].to_list(),
+     "fold": valid["fold"].to_list(),
+     "text": valid["text"].to_list(),
+     "reason": valid["reason"].to_list(),
+     "id": [x.split("/")[-1].replace(".jpg", "") for x in valid["image"].to_list()]
+ }).cast_column("image", Image())
+
+ def transform(example_batch):
+     return processor(
+         images=[image for image in example_batch["image"]],
+         text=[text for text in example_batch["text"]],
+         return_tensors="np",
+         padding='max_length',
+         max_length=512
+     )
+
+ # apply
+ train_prepared = train_dataset.shuffle(seed=42).with_transform(transform)
+ valid_prepared = valid_dataset.shuffle(seed=42).with_transform(transform)
+
+ # " ".join(processor.batch_decode(train_prepared[0]["input_ids"])).replace(" ##","")
+ training_args = TrainingArguments(
+     num_train_epochs=5,
+     evaluation_strategy="epoch",
+     save_steps=1000,
+     logging_steps=100,
+     per_device_eval_batch_size=2,
+     per_device_train_batch_size=2,
+     gradient_accumulation_steps=8,
+     lr_scheduler_type='cosine_with_restarts',
+     warmup_ratio=0.1,
+     learning_rate=5e-5,
+     save_total_limit=1,
+     output_dir="/opt/models/generate-cxr-checkpoints"
+ )
+
+ data_collator = DataCollatorForLanguageModeling(
+     tokenizer=tokenizer,
+     mlm=False
+ )
+
+ trainer = Trainer(
+     model=model,
+     tokenizer=processor,
+     args=training_args,
+     train_dataset=train_prepared,
+     eval_dataset=valid_prepared,
+     data_collator=data_collator,
+ )
+
+ trainer.train()
+ trainer.save_model("/opt/models/generate-cxr")
+ trainer.save_model("/opt/models/generate-cxr")