🎉 (Mimoracle): First commit
louni-g committed Jan 19, 2024
1 parent 94943fb commit 5cfc606
Showing 8 changed files with 1,558 additions and 0 deletions.
4 changes: 4 additions & 0 deletions lib/mimoracle/README.md
@@ -0,0 +1,4 @@
# 🧚‍ MimOracle: A MIMIC-III-based Oracle for Clinical Record Completion

This project contains the MimOracle dataset, built from the MIMIC-III dataset and designed
as a benchmark for clinical record completion.
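A minimal loading sketch (an editorial illustration, not part of this commit): assuming the preprocessing CSVs exist where the loader's _URLS paths point, and that the command runs from the repository root, the dataset should load through the local builder script added further down in this commit.

from datasets import load_dataset

# Load MimOracle through the local builder script (path and working directory assumed)
dataset = load_dataset("lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py")
print(dataset["train"][0])  # {'subject_id': ..., 'row_id': ..., 'title': ..., 'content': ...}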
22 changes: 22 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/README.md
@@ -0,0 +1,22 @@
---
dataset_info:
config_name: mimicoracle
features:
- name: subject_id
dtype: string
- name: row_id
dtype: string
- name: title
dtype: string
- name: content
dtype: string
splits:
- name: train
num_bytes: 7362383
num_examples: 15036
- name: test
num_bytes: 2039898
num_examples: 4104
download_size: 9310737
dataset_size: 9402281
---
Empty file.
93 changes: 93 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/_processing.py
@@ -0,0 +1,93 @@
import logging
import re
import sqlite3
from pathlib import Path
from typing import Optional

import pandas as pd
import typer
from tqdm import tqdm

ROOT = Path(__file__).parent.parent.parent


def main(
min_docs_per_patient: int = 3,
n_patients: Optional[int] = 1000,
mimic_db_path: Path = ROOT / "data" / "mimic.db",
) -> None:
"""Preprocess the MIMIC-III dataset into train and test CSV files.
Only the discharge summaries are kept, and only for `n_patients` patients with more than
`min_docs_per_patient` documents. The discharge summaries are then split into sections.
The script is based on the `mimic.db` SQLite database, which must be downloaded beforehand.
Args:
min_docs_per_patient: Minimum number of documents a patient must have to be included.
n_patients: Number of patients to include in the dataset.
mimic_db_path: Path to the `mimic.db` SQLite database.
"""
conn = sqlite3.connect(mimic_db_path)
logging.basicConfig(level=logging.ERROR)

# Select the discharge summaries for patients with more than min_docs_per_patient documents
    query = """
        SELECT subject_id, row_id, text
        FROM noteevents
        WHERE category = 'Discharge summary'
            AND subject_id IN (
                SELECT subject_id
                FROM noteevents
                WHERE category = 'Discharge summary'
                GROUP BY subject_id
                HAVING COUNT(row_id) > ?
            )
        ORDER BY subject_id;
    """
    # Parameterized query: cleaner and safer than interpolating the value into the SQL string
    df = pd.read_sql_query(query, conn, params=(min_docs_per_patient,))
conn.close()

    if n_patients:
        # df has one row per note, so select the notes of the first n_patients patients
        # rather than slicing the first n_patients rows
        kept_subjects = df["subject_id"].unique()[:n_patients]
        df = df[df["subject_id"].isin(kept_subjects)]

records = []
for _, row in tqdm(df.iterrows(), total=len(df)):
subject_id = row["subject_id"]
row_id = row["row_id"]
note = row["text"]

        # Separate the note into sections, with titles and contents.
        # [A-Za-z\s-] replaces the overly permissive [A-z-\s] flagged by CodeQL:
        # the [A-z] range also matches the characters [, \, ], ^, _ and `.
        section_pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
        titles = re.findall(section_pattern, note, flags=re.MULTILINE)
        contents = re.split(section_pattern, note, flags=re.MULTILINE)[1:]
for title, content in zip(titles, contents):
title = title.strip().replace(":", "")
content = content.strip()
if len(title) < 3 or len(content.split(" ")) < 3:
# Skip sections with too few words or too short titles
continue
records.append(
{
"subject_id": subject_id,
"document_id": row_id,
"title": title,
"content": content,
}
)

df = pd.DataFrame.from_records(records)

# Split into train and test sets, by patient
subject_ids = df["subject_id"].unique()
n_train_subjects = int(0.8 * len(subject_ids))
train_subject_ids = subject_ids[:n_train_subjects]
test_subject_ids = subject_ids[n_train_subjects:]
df_train = df[df["subject_id"].isin(train_subject_ids)]
df_test = df[df["subject_id"].isin(test_subject_ids)]

    # index=False keeps the pandas index out of the CSVs consumed by the dataset loader
    df_train.to_csv(ROOT / "data" / "mimoracle_train.csv", index=False)
    df_test.to_csv(ROOT / "data" / "mimoracle_test.csv", index=False)


if __name__ == "__main__":
typer.run(main)
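An editorial sketch (not part of the commit) of how the section-splitting regex behaves on a toy note; the sample text here is invented for illustration:

import re

note = "Chief Complaint:\nChest pain for two days.\nDischarge Diagnosis:\nUnstable angina, now stable.\n"
pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
titles = re.findall(pattern, note, flags=re.MULTILINE)
contents = re.split(pattern, note, flags=re.MULTILINE)[1:]
for title, content in zip(titles, contents):
    print(title.strip().replace(":", ""), "->", content.strip())
# Chief Complaint -> Chest pain for two days.
# Discharge Diagnosis -> Unstable angina, now stable.

Since the script ends in typer.run(main), it should also be invocable from the command line, e.g. python _processing.py --n-patients 1000 (Typer derives the --min-docs-per-patient-style flags from the parameter names).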
132 changes: 132 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py
@@ -0,0 +1,132 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datasets
import pandas as pd

_DESCRIPTION = """\
This dataset is a collection of discharge summaries from MIMIC-III that have been split into
sections, keeping 1000 patients who each have more than 3 documents.
"""

_DATASET_NAME = "mimicoracle"

_HOMEPAGE = "https://github.com/arkhn/open-nlp"

_LICENSE = "http://www.apache.org/licenses/LICENSE-2.0"

_URLS = {"train": "data/mimoracle_train.csv", "test": "data/mimoracle_test.csv"}

_CITATION = """\
@inproceedings{10.1145/3368555.3384469,
author = {Wang, Shirly and McDermott, Matthew B. A. and Chauhan,
Geeticka and Ghassemi, Marzyeh and Hughes, Michael C. and Naumann, Tristan},
title = {MIMIC-Extract: A Data Extraction, Preprocessing, and Representation
Pipeline for MIMIC-III},
year = {2020},
isbn = {9781450370462},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3368555.3384469},
doi = {10.1145/3368555.3384469},
abstract = {Machine learning for healthcare researchers face challenges to
progress and reproducibility due to a lack of standardized processing
frameworks for public datasets. We present MIMIC-Extract, an open source pipeline
for transforming the raw electronic health record (EHR) data of critical care
patients from the publicly-available MIMIC-III database into data structures that are directly
usable in common time-series prediction pipelines. MIMIC-Extract addresses three challenges
in making complex EHR data accessible to the broader machine learning community.
First, MIMIC-Extract transforms raw vital sign and laboratory measurements into usable hourly
time series, performing essential steps such as unit conversion, outlier handling,
and aggregation of semantically similar features to reduce missingness and improve robustness.
Second, MIMIC-Extract extracts and makes prediction of clinically-relevant targets possible,
including outcomes such as mortality and length-of-stay
as well as comprehensive hourly intervention signals for ventilators, vasopressors,
and fluid therapies. Finally, the pipeline emphasizes reproducibility and
extensibility to future research questions. We demonstrate the pipeline's effectiveness
by developing several benchmark tasks for outcome and intervention forecasting and
assessing the performance of competitive models.},
booktitle = {Proceedings of the ACM Conference on Health, Inference, and Learning},
pages = {222–235},
numpages = {14},
keywords = {Machine learning, MIMIC-III, Healthcare, Time series data, Reproducibility},
location = {Toronto, Ontario, Canada},
series = {CHIL '20}}
"""


class MimoracleDataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for mimic III clinical record completion.
It contains sections of discharge summaries for 1000 patients with more than 3 documents.
"""

VERSION = datasets.Version("0.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=_DATASET_NAME,
version=VERSION,
description=_DESCRIPTION,
),
]

DEFAULT_CONFIG_NAME = _DATASET_NAME

def _info(self):
features = datasets.Features(
{
"subject_id": datasets.Value("string"),
"row_id": datasets.Value("string"),
"title": datasets.Value("string"),
"content": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):
data_dirs = dl_manager.download_and_extract(_URLS)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(data_dirs["train"]),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(data_dirs["test"]),
"split": "test",
},
),
]

def _generate_examples(self, filepath, split):
df = pd.read_csv(filepath)
for i, row in df.iterrows():
            yield i, {
                "subject_id": str(row["subject_id"]),
                # the preprocessing script writes the note id under "document_id"
                "row_id": str(row["document_id"]),
                "title": row["title"],
                "content": row["content"],
            }
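A quick smoke test of the builder (an editorial sketch, not part of the commit; it assumes the preprocessing CSVs exist at the relative paths listed in _URLS, resolved next to the script, and that the command runs from the repository root):

import datasets

# Build both splits through the local script and inspect the schema
ds = datasets.load_dataset("lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py")
print(ds)                    # DatasetDict with "train" and "test" splits
print(ds["train"].features)  # subject_id, row_id, title, content (all strings)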
Empty file.
