Showing 8 changed files with 1,558 additions and 0 deletions.
@@ -0,0 +1,4 @@
# 🧚 MimOracle: A MIMIC-III based Oracle for Clinical Record Completion

This project contains the MimOracle dataset, based on the MIMIC-III dataset. The MimOracle dataset
is designed to be used as a benchmark for clinical record completion.
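To make the benchmark concrete: each example in the released splits is one section of a discharge summary. A sketch of what a single record looks like, with field values invented for illustration (the field names come from the dataset card below):

# Shape of one MimOracle example (values are invented placeholders)
example = {
    "subject_id": "12345",           # MIMIC-III patient identifier
    "row_id": "67890",               # identifier of the source discharge summary
    "title": "Discharge Diagnosis",  # section title parsed from the note
    "content": "unstable angina",    # section body a model is asked to complete
}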
@@ -0,0 +1,22 @@
---
dataset_info:
  config_name: mimicoracle
  features:
    - name: subject_id
      dtype: string
    - name: row_id
      dtype: string
    - name: title
      dtype: string
    - name: content
      dtype: string
  splits:
    - name: train
      num_bytes: 7362383
      num_examples: 15036
    - name: test
      num_bytes: 2039898
      num_examples: 4104
  download_size: 9310737
  dataset_size: 9402281
---
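As a quick sanity check on the card metadata, the split byte counts add up to the declared `dataset_size` (a throwaway sketch, not part of the commit):

# 7,362,383 train bytes + 2,039,898 test bytes == 9,402,281 total bytes
assert 7362383 + 2039898 == 9402281
# and the two splits together hold 15,036 + 4,104 = 19,140 section examples
assert 15036 + 4104 == 19140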
Empty file.
@@ -0,0 +1,93 @@
import logging
import re
import sqlite3
from pathlib import Path
from typing import Optional

import pandas as pd
import typer
from tqdm import tqdm

ROOT = Path(__file__).parent.parent.parent


def main(
    min_docs_per_patient: int = 3,
    n_patients: Optional[int] = 1000,
    mimic_db_path: Path = ROOT / "data" / "mimic.db",
) -> None:
    """Preprocess the MIMIC-III dataset into train and test CSV files.

    Only the discharge summaries are kept, and only for `n_patients` patients with more than
    `min_docs_per_patient` documents. The discharge summaries are then split into sections.
    The script is based on the `mimic.db` SQLite database, which must be downloaded beforehand.

    Args:
        min_docs_per_patient: Minimum number of documents a patient must have to be included.
        n_patients: Number of patients to include in the dataset.
        mimic_db_path: Path to the `mimic.db` SQLite database.
    """
    conn = sqlite3.connect(mimic_db_path)
    logging.basicConfig(level=logging.ERROR)

    # Select the discharge summaries for patients with more than min_docs_per_patient documents
    query = f"""
        SELECT subject_id, row_id, text
        FROM noteevents
        WHERE category = 'Discharge summary'
        AND subject_id IN (
            SELECT subject_id
            FROM noteevents
            WHERE category = 'Discharge summary'
            GROUP BY subject_id
            HAVING COUNT(row_id) > {min_docs_per_patient}
        )
        ORDER BY subject_id;
    """
    df = pd.read_sql_query(query, conn)
    conn.close()

    if n_patients:
        # Keep all notes of the first `n_patients` distinct patients (a plain
        # `df[:n_patients]` would truncate to the first rows, i.e. notes, instead)
        kept_subject_ids = df["subject_id"].unique()[:n_patients]
        df = df[df["subject_id"].isin(kept_subject_ids)]

    records = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        subject_id = row["subject_id"]
        row_id = row["row_id"]
        note = row["text"]

        # Separate the note into sections, with titles and contents. The character class
        # [A-Za-z\s-] replaces the overly permissive [A-z-\s] flagged by CodeQL, whose
        # A-z range also matched the characters [ \ ] ^ _ and backtick.
        section_pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
        titles = re.findall(section_pattern, note, flags=re.MULTILINE)
        contents = re.split(section_pattern, note, flags=re.MULTILINE)[1:]

        for title, content in zip(titles, contents):
            title = title.strip().replace(":", "")
            content = content.strip()
            if len(title) < 3 or len(content.split(" ")) < 3:
                # Skip sections with too short titles or too few words
                continue
            records.append(
                {
                    "subject_id": subject_id,
                    "document_id": row_id,
                    "title": title,
                    "content": content,
                }
            )

    df = pd.DataFrame.from_records(records)

    # Split into train and test sets, by patient
    subject_ids = df["subject_id"].unique()
    n_train_subjects = int(0.8 * len(subject_ids))
    train_subject_ids = subject_ids[:n_train_subjects]
    test_subject_ids = subject_ids[n_train_subjects:]
    df_train = df[df["subject_id"].isin(train_subject_ids)]
    df_test = df[df["subject_id"].isin(test_subject_ids)]

    # index=False keeps the CSVs free of a spurious unnamed index column
    df_train.to_csv(ROOT / "data" / "mimoracle_train.csv", index=False)
    df_test.to_csv(ROOT / "data" / "mimoracle_test.csv", index=False)


if __name__ == "__main__":
    typer.run(main)
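To see what the section-splitting pattern actually does, here is a small illustration on an invented toy note; the sample text is fabricated, not taken from MIMIC-III:

# Demonstration of the section-splitting regex used above, on fabricated text
import re

note = "Chief Complaint: chest pain for two days\nDischarge Diagnosis: unstable angina\n"
section_pattern = r"^[A-Za-z\s-]+:(?:\n\n|$|\s)"
titles = re.findall(section_pattern, note, flags=re.MULTILINE)
contents = re.split(section_pattern, note, flags=re.MULTILINE)[1:]
for title, content in zip(titles, contents):
    print(title.strip().replace(":", ""), "->", content.strip())
# Chief Complaint -> chest pain for two days
# Discharge Diagnosis -> unstable angina

Since the entry point is wrapped in `typer.run`, the script would be invoked along the lines of `python preprocess.py --n-patients 1000 --min-docs-per-patient 3`; the file name here is an assumption, while the flag names are derived by Typer from the function arguments.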
@@ -0,0 +1,132 @@
# Copyright 2020 The HuggingFace Datasets Authors and the current dataset script contributor.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import datasets
import pandas as pd

_DESCRIPTION = """\
This dataset is a collection of discharge summaries from MIMIC-III that have been split into
sections and filtered to keep 1000 patients with more than 3 documents.
"""

_DATASET_NAME = "mimicoracle"

_HOMEPAGE = "https://github.com/arkhn/open-nlp"

_LICENSE = "http://www.apache.org/licenses/LICENSE-2.0"

_URLS = {"train": "data/mimoracle_train.csv", "test": "data/mimoracle_test.csv"}

_CITATION = """\
@inproceedings{10.1145/3368555.3384469,
author = {Wang, Shirly and McDermott, Matthew B. A. and Chauhan,
Geeticka and Ghassemi, Marzyeh and Hughes, Michael C. and Naumann, Tristan},
title = {MIMIC-Extract: A Data Extraction, Preprocessing, and Representation
Pipeline for MIMIC-III},
year = {2020},
isbn = {9781450370462},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3368555.3384469},
doi = {10.1145/3368555.3384469},
abstract = {Machine learning for healthcare researchers face challenges to
progress and reproducibility due to a lack of standardized processing
frameworks for public datasets. We present MIMIC-Extract, an open source pipeline
for transforming the raw electronic health record (EHR) data of critical care
patients from the publicly-available MIMIC-III database into data structures that are directly
usable in common time-series prediction pipelines. MIMIC-Extract addresses three challenges
in making complex EHR data accessible to the broader machine learning community.
First, MIMIC-Extract transforms raw vital sign and laboratory measurements into usable hourly
time series, performing essential steps such as unit conversion, outlier handling,
and aggregation of semantically similar features to reduce missingness and improve robustness.
Second, MIMIC-Extract extracts and makes prediction of clinically-relevant targets possible,
including outcomes such as mortality and length-of-stay
as well as comprehensive hourly intervention signals for ventilators, vasopressors,
and fluid therapies. Finally, the pipeline emphasizes reproducibility and
extensibility to future research questions. We demonstrate the pipeline's effectiveness
by developing several benchmark tasks for outcome and intervention forecasting and
assessing the performance of competitive models.},
booktitle = {Proceedings of the ACM Conference on Health, Inference, and Learning},
pages = {222–235},
numpages = {14},
keywords = {Machine learning, MIMIC-III, Healthcare, Time series data, Reproducibility},
location = {Toronto, Ontario, Canada},
series = {CHIL '20}}
"""


class MimoracleDataset(datasets.GeneratorBasedBuilder):
    """HuggingFace dataset for MIMIC-III clinical record completion.

    It contains sections of discharge summaries for 1000 patients with more than 3 documents.
    """

    VERSION = datasets.Version("0.1.0")
    BUILDER_CONFIGS = [
        datasets.BuilderConfig(
            name=_DATASET_NAME,
            version=VERSION,
            description=_DESCRIPTION,
        ),
    ]

    DEFAULT_CONFIG_NAME = _DATASET_NAME

    def _info(self):
        features = datasets.Features(
            {
                "subject_id": datasets.Value("string"),
                "row_id": datasets.Value("string"),
                "title": datasets.Value("string"),
                "content": datasets.Value("string"),
            }
        )
        return datasets.DatasetInfo(
            description=_DESCRIPTION,
            features=features,
            homepage=_HOMEPAGE,
            license=_LICENSE,
            citation=_CITATION,
        )

    def _split_generators(self, dl_manager):
        data_dirs = dl_manager.download_and_extract(_URLS)
        return [
            datasets.SplitGenerator(
                name=datasets.Split.TRAIN,
                # download_and_extract already returns resolved local paths; the
                # original single-argument os.path.join call was a no-op
                gen_kwargs={"filepath": data_dirs["train"], "split": "train"},
            ),
            datasets.SplitGenerator(
                name=datasets.Split.TEST,
                gen_kwargs={"filepath": data_dirs["test"], "split": "test"},
            ),
        ]

    def _generate_examples(self, filepath, split):
        df = pd.read_csv(filepath)
        for i, row in df.iterrows():
            yield i, {
                "subject_id": str(row["subject_id"]),
                "row_id": str(row["document_id"]),  # CSV column is named document_id
                "title": row["title"],
                "content": row["content"],
            }
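For completeness, a hedged sketch of how this loader might be smoke-tested locally; the script file name `mimoracle.py` and the working-directory layout are assumptions, not taken from this commit:

# Hypothetical local smoke test for the dataset script above
import datasets

ds = datasets.load_dataset("./mimoracle.py", name="mimicoracle")
print(ds)                    # DatasetDict with "train" and "test" splits
print(ds["train"].features)  # subject_id, row_id, title, content (all strings)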
Empty file.