Skip to content

Commit

Permalink
✨ (Mimoracle): add chartdate
Browse files Browse the repository at this point in the history
  • Loading branch information
louni-g committed Feb 14, 2024
1 parent 5cfc606 commit daa42e1
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
12 changes: 7 additions & 5 deletions lib/mimoracle/hf_datasets/mimic_iii/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,15 @@ dataset_info:
dtype: string
- name: content
dtype: string
- name: chartdate
dtype: string
splits:
- name: train
num_bytes: 7362383
num_bytes: 7813463
num_examples: 15036
- name: test
num_bytes: 2039898
num_examples: 4104
download_size: 9310737
dataset_size: 9402281
num_bytes: 2172574
num_examples: 4129
download_size: 9836924
dataset_size: 9986037
---
4 changes: 3 additions & 1 deletion lib/mimoracle/hf_datasets/mimic_iii/_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def main(

# Select the discharge summaries for patients with more than min_docs_per_patient documents
query = f"""
SELECT subject_id, row_id, text
SELECT subject_id, row_id, text, chartdate
FROM noteevents
WHERE category = 'Discharge summary'
AND subject_id IN (
Expand All @@ -56,6 +56,7 @@ def main(
subject_id = row["subject_id"]
row_id = row["row_id"]
note = row["text"]
chartdate = row["chartdate"]

# Separate the note into sections, with titles and contents
titles = re.findall(r"^[A-z-\s]+:(?:\n\n|$|\s)", note, flags=re.MULTILINE)

Check warning

Code scanning / CodeQL

Overly permissive regular expression range Medium

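Suspicious character range that is equivalent to [A-Z\[\\\]^_`a-z]. The range `A-z` also matches the six ASCII characters between `Z` and `a`; `A-Za-z` matches letters only.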
Expand All @@ -72,6 +73,7 @@ def main(
"document_id": row_id,
"title": title,
"content": content,
"chartdate": chartdate,
}
)

Expand Down
2 changes: 2 additions & 0 deletions lib/mimoracle/hf_datasets/mimic_iii/mimoracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def _info(self):
"row_id": datasets.Value("string"),
"title": datasets.Value("string"),
"content": datasets.Value("string"),
"chartdate": datasets.Value("string"),
}
)
return datasets.DatasetInfo(
Expand Down Expand Up @@ -129,4 +130,5 @@ def _generate_examples(self, filepath, split):
"row_id": str(row["document_id"]),
"title": row["title"],
"content": row["content"],
"chartdate": row["chartdate"],
}

0 comments on commit daa42e1

Please sign in to comment.