🎉 (Mimoracle): First commit
louni-g committed Jan 19, 2024
1 parent 94943fb commit 5cfc606
Showing 8 changed files with 1,558 additions and 0 deletions.
4 changes: 4 additions & 0 deletions lib/mimoracle/README.md
@@ -0,0 +1,4 @@
# 🧚‍ MimOracle: A MIMIC-III-based Oracle for Clinical Record Completion

This project contains the MimOracle dataset, built from the MIMIC-III dataset and designed
as a benchmark for clinical record completion.
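A minimal loading sketch (an editorial illustration, not part of this commit): assuming the preprocessing CSVs exist where the loader's _URLS paths point, and that the command runs from the repository root, the dataset should load through the local builder script added further down in this commit.

from datasets import load_dataset

# Load MimOracle through the local builder script (path and working directory assumed)
dataset = load_dataset("lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py")
print(dataset["train"][0])  # {'subject_id': ..., 'row_id': ..., 'title': ..., 'content': ...}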
22 changes: 22 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/README.md
@@ -0,0 +1,22 @@
---
dataset_info:
config_name: mimicoracle
features:
- name: subject_id
dtype: string
- name: row_id
dtype: string
- name: title
dtype: string
- name: content
dtype: string
splits:
- name: train
num_bytes: 7362383
num_examples: 15036
- name: test
num_bytes: 2039898
num_examples: 4104
download_size: 9310737
dataset_size: 9402281
---
Empty file.
93 changes: 93 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/_processing.py
@@ -0,0 +1,93 @@
import logging
import re
import sqlite3
from pathlib import Path
from typing import Optional

import pandas as pd
import typer
from tqdm import tqdm

ROOT = Path(__file__).parent.parent.parent


def main(
min_docs_per_patient: int = 3,
n_patients: Optional[int] = 1000,
mimic_db_path: Path = ROOT / "data" / "mimic.db",
) -> None:
"""Preprocess the MIMIC-III dataset into train and test CSV files.
Only the discharge summaries are kept, and only for `n_patients` patients with more than
`min_docs_per_patient` documents. The discharge summaries are then split into sections.
The script is based on the `mimic.db` SQLite database, which must be downloaded beforehand.
Args:
min_docs_per_patient: Minimum number of documents a patient must have to be included.
n_patients: Number of patients to include in the dataset.
mimic_db_path: Path to the `mimic.db` SQLite database.
"""
conn = sqlite3.connect(mimic_db_path)
logging.basicConfig(level=logging.ERROR)

# Select the discharge summaries for patients with more than min_docs_per_patient documents
    query = """
        SELECT subject_id, row_id, text
        FROM noteevents
        WHERE category = 'Discharge summary'
            AND subject_id IN (
                SELECT subject_id
                FROM noteevents
                WHERE category = 'Discharge summary'
                GROUP BY subject_id
                HAVING COUNT(row_id) > ?
            )
        ORDER BY subject_id;
    """
    # Parameterized query: cleaner and safer than interpolating the value into the SQL string
    df = pd.read_sql_query(query, conn, params=(min_docs_per_patient,))
conn.close()

    if n_patients:
        # df has one row per note, so select the notes of the first n_patients patients
        # rather than slicing the first n_patients rows
        kept_subjects = df["subject_id"].unique()[:n_patients]
        df = df[df["subject_id"].isin(kept_subjects)]

records = []
for _, row in tqdm(df.iterrows(), total=len(df)):
subject_id = row["subject_id"]
row_id = row["row_id"]
note = row["text"]

        # Separate the note into sections, with titles and contents.
        # [A-Za-z\s-] replaces the overly permissive [A-z-\s] flagged by CodeQL:
        # the [A-z] range also matches the characters [, \, ], ^, _ and `.
        section_pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
        titles = re.findall(section_pattern, note, flags=re.MULTILINE)
        contents = re.split(section_pattern, note, flags=re.MULTILINE)[1:]
for title, content in zip(titles, contents):
title = title.strip().replace(":", "")
content = content.strip()
if len(title) < 3 or len(content.split(" ")) < 3:
# Skip sections with too few words or too short titles
continue
records.append(
{
"subject_id": subject_id,
"document_id": row_id,
"title": title,
"content": content,
}
)

df = pd.DataFrame.from_records(records)

# Split into train and test sets, by patient
subject_ids = df["subject_id"].unique()
n_train_subjects = int(0.8 * len(subject_ids))
train_subject_ids = subject_ids[:n_train_subjects]
test_subject_ids = subject_ids[n_train_subjects:]
df_train = df[df["subject_id"].isin(train_subject_ids)]
df_test = df[df["subject_id"].isin(test_subject_ids)]

    # index=False keeps the pandas index out of the CSVs consumed by the dataset loader
    df_train.to_csv(ROOT / "data" / "mimoracle_train.csv", index=False)
    df_test.to_csv(ROOT / "data" / "mimoracle_test.csv", index=False)


if __name__ == "__main__":
typer.run(main)
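An editorial sketch (not part of the commit) of how the section-splitting regex behaves on a toy note; the sample text here is invented for illustration:

import re

note = "Chief Complaint:\nChest pain for two days.\nDischarge Diagnosis:\nUnstable angina, now stable.\n"
pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
titles = re.findall(pattern, note, flags=re.MULTILINE)
contents = re.split(pattern, note, flags=re.MULTILINE)[1:]
for title, content in zip(titles, contents):
    print(title.strip().replace(":", ""), "->", content.strip())
# Chief Complaint -> Chest pain for two days.
# Discharge Diagnosis -> Unstable angina, now stable.

Since the script ends in typer.run(main), it should also be invocable from the command line, e.g. python _processing.py --n-patients 1000 (Typer derives the --min-docs-per-patient-style flags from the parameter names).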
132 changes: 132 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py
@@ -0,0 +1,132 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datasets
import pandas as pd

_DESCRIPTION = """\
This dataset is a collection of discharge summaries from MIMIC-III that have been split into
sections, keeping 1000 patients who each have more than 3 documents.
"""

_DATASET_NAME = "mimicoracle"

_HOMEPAGE = "https://github.com/arkhn/open-nlp"

_LICENSE = "http://www.apache.org/licenses/LICENSE-2.0"

_URLS = {"train": "data/mimoracle_train.csv", "test": "data/mimoracle_test.csv"}

_CITATION = """\
@inproceedings{10.1145/3368555.3384469,
author = {Wang, Shirly and McDermott, Matthew B. A. and Chauhan,
Geeticka and Ghassemi, Marzyeh and Hughes, Michael C. and Naumann, Tristan},
title = {MIMIC-Extract: A Data Extraction, Preprocessing, and Representation
Pipeline for MIMIC-III},
year = {2020},
isbn = {9781450370462},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3368555.3384469},
doi = {10.1145/3368555.3384469},
abstract = {Machine learning for healthcare researchers face challenges to
progress and reproducibility due to a lack of standardized processing
frameworks for public datasets. We present MIMIC-Extract, an open source pipeline
for transforming the raw electronic health record (EHR) data of critical care
patients from the publicly-available MIMIC-III database into data structures that are directly
usable in common time-series prediction pipelines. MIMIC-Extract addresses three challenges
in making complex EHR data accessible to the broader machine learning community.
First, MIMIC-Extract transforms raw vital sign and laboratory measurements into usable hourly
time series, performing essential steps such as unit conversion, outlier handling,
and aggregation of semantically similar features to reduce missingness and improve robustness.
Second, MIMIC-Extract extracts and makes prediction of clinically-relevant targets possible,
including outcomes such as mortality and length-of-stay
as well as comprehensive hourly intervention signals for ventilators, vasopressors,
and fluid therapies. Finally, the pipeline emphasizes reproducibility and
extensibility to future research questions. We demonstrate the pipeline's effectiveness
by developing several benchmark tasks for outcome and intervention forecasting and
assessing the performance of competitive models.},
booktitle = {Proceedings of the ACM Conference on Health, Inference, and Learning},
pages = {222–235},
numpages = {14},
keywords = {Machine learning, MIMIC-III, Healthcare, Time series data, Reproducibility},
location = {Toronto, Ontario, Canada},
series = {CHIL '20}}
"""


class MimoracleDataset(datasets.GeneratorBasedBuilder):
"""This is the huggingface dataset for mimic III clinical record completion.
It contains sections of discharge summaries for 1000 patients with more than 3 documents.
"""

VERSION = datasets.Version("0.1.0")
BUILDER_CONFIGS = [
datasets.BuilderConfig(
name=_DATASET_NAME,
version=VERSION,
description=_DESCRIPTION,
),
]

DEFAULT_CONFIG_NAME = _DATASET_NAME

def _info(self):
features = datasets.Features(
{
"subject_id": datasets.Value("string"),
"row_id": datasets.Value("string"),
"title": datasets.Value("string"),
"content": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
description=_DESCRIPTION,
features=features,
homepage=_HOMEPAGE,
license=_LICENSE,
citation=_CITATION,
)

def _split_generators(self, dl_manager):
data_dirs = dl_manager.download_and_extract(_URLS)
return [
datasets.SplitGenerator(
name=datasets.Split.TRAIN,
gen_kwargs={
"filepath": os.path.join(data_dirs["train"]),
"split": "train",
},
),
datasets.SplitGenerator(
name=datasets.Split.TEST,
gen_kwargs={
"filepath": os.path.join(data_dirs["test"]),
"split": "test",
},
),
]

def _generate_examples(self, filepath, split):
df = pd.read_csv(filepath)
for i, row in df.iterrows():
            yield i, {
                "subject_id": str(row["subject_id"]),
                # the preprocessing script writes the note id under "document_id"
                "row_id": str(row["document_id"]),
                "title": row["title"],
                "content": row["content"],
            }
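A quick smoke test of the builder (an editorial sketch, not part of the commit; it assumes the preprocessing CSVs exist at the relative paths listed in _URLS, resolved next to the script, and that the command runs from the repository root):

import datasets

# Build both splits through the local script and inspect the schema
ds = datasets.load_dataset("lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py")
print(ds)                    # DatasetDict with "train" and "test" splits
print(ds["train"].features)  # subject_id, row_id, title, content (all strings)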
Empty file.
