✨ 🍻 feat(question_eval): Integrate mimoracle dataset to question eval experiments #41

Merged
merged 19 commits into from
Nov 7, 2024
Changes from 15 commits
3 changes: 0 additions & 3 deletions lib/questions_eval/bash/experiments/super_tiny.sh

This file was deleted.

4 changes: 4 additions & 0 deletions lib/questions_eval/bash/experiments/tiny_mimoracle.sh
@@ -0,0 +1,4 @@
# python run_mimoracle.py -m model=gpt-4o samples=18 num_questions=5
# python run_mimoracle.py -m model=gpt-4o-mini samples=18 num_questions=5
# python run_mimoracle.py -m model=llama3.1-405b-local samples=18 num_questions=5
python run_mimoracle.py -m model=gpt-4o samples=2 num_questions=2
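
For context: the -m flag and the key=value overrides in these commands indicate a Hydra multirun entry point. Below is a minimal sketch of what run_mimoracle.py presumably looks like, assuming it composes run_mimoracle.yaml from the configs directory; the config_path value and the body of main are illustrative, not part of this PR.

import hydra
from omegaconf import DictConfig


# Hypothetical sketch of the entry point invoked by the experiment script above;
# only the config fields visible in run_mimoracle.yaml (dataset, samples,
# num_questions) are taken from the source -- everything else is illustrative.
@hydra.main(version_base=None, config_path="configs", config_name="run_mimoracle")
def main(cfg: DictConfig) -> None:
    print(f"dataset={cfg.dataset} samples={cfg.samples} num_questions={cfg.num_questions}")
    # ...question generation and evaluation happen in the real script...


if __name__ == "__main__":
    main()
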
44 changes: 44 additions & 0 deletions lib/questions_eval/configs/run_mimoracle.yaml
@@ -0,0 +1,44 @@
# @package _global_
defaults:
- _self_
- model: gpt-4o-mini.yaml
- question_model: gpt-4o.yaml

samples: 2
num_questions: 5
dataset: "sub_mimoracle"

prompts:
  summary: >-
    As a clinician assistant, you must write a summary for a specified section of a clinical
    report, given this patient's information. section_title: {section_title} text: {text}

    Synthetic Summary:

  question: >-
    As a clinical assistant, please formulate {num_questions} critical, concise and closed-ended
    questions (in a YES/NO format) that thoroughly scrutinize the document. The generated
    questions should ALWAYS result in a ‘YES’ based on the given text. Questions should be about
    the content of the document and must not include any qualifier about clarity, justification
    or definition. **Note** The questions have to be STRICTLY closed-ended and must not be
    subjective or open to human interpretation. Return the output in JSON format, as a list of
    dictionaries where each dictionary has two keys: ‘question’, specifying the question, and
    ‘answer’, either YES or NO. The given text should be able to answer ‘YES’ to each generated
    question.

    Document: {summary}

    JSON:

  evaluation: >-
    As a clinical assistant, answer the following question with YES or NO, grounded in the text
    content only. Do not use any external knowledge. If you cannot answer the question based on
    the provided text, respond with ‘IDK’.

    **Note** You should respond with either YES, NO or IDK.

    Document: {summary}

    Question: {question}

    Answer:
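
For illustration only (not part of the PR): once the config is loaded, the prompt templates above can be filled with plain str.format. The placeholder names ({num_questions}, {summary}, {question}) come from the config itself; loading the YAML directly with OmegaConf instead of composing it through Hydra is just a shortcut for this sketch.

from omegaconf import OmegaConf

cfg = OmegaConf.load("lib/questions_eval/configs/run_mimoracle.yaml")

# Fill the question-generation template for one synthetic summary.
prompt = cfg.prompts.question.format(
    num_questions=cfg.num_questions,
    summary="allergies: no known drug allergies.",
)
print(prompt)
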
Empty file.
32 changes: 32 additions & 0 deletions lib/questions_eval/hf_datasets/sub_mimoracle/_preprocessing.py
@@ -0,0 +1,32 @@
import pandas as pd
from datasets import load_dataset
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub before loading the dataset.
notebook_login()


def _preprocess(text: str) -> str:
    # Keep only the last line of the raw section title and lowercase it.
    return text.split("\n")[-1].lower()


def _resample(df: pd.DataFrame, n_sample: int, n_section: int) -> pd.DataFrame:
    # Regex alternation of the six section titles to keep.
    patterns = (
        "allergies|history of present illness|past medical history|"
        "discharge medications|social history|medications on admission"
    )
    df["section_title"] = [_preprocess(x) for x in df["section_title"]]
    # Keep target sections with more than n_sample rows, then keep only
    # documents that contain exactly n_section of those sections.
    df = df[df.section_title.str.contains(patterns)]
    df = df.groupby("section_title").filter(lambda x: len(x) > n_sample)
    df = df.groupby("document_id").filter(lambda x: len(x) == n_section)
    df.rename(columns={"section_content": "summary"}, inplace=True)
    return df


def save_data_sample(input_path: str, split: str, output_path: str) -> None:
    df = load_dataset(input_path, split=split).to_pandas()
    df = _resample(df, 2, 6)
    df.to_csv(output_path, index=False)


if __name__ == "__main__":
    save_data_sample("bio-datasets/mimoracle", "train", "./data/mimoracle_train.csv")
    save_data_sample("bio-datasets/mimoracle", "test", "./data/mimoracle_test.csv")
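
A quick sanity check one might run after the script above (hypothetical, not part of the PR). The column names come from the preprocessing code; the expected structure follows from the regex filter and the n_sample/n_section arguments.

import pandas as pd

df = pd.read_csv("./data/mimoracle_train.csv")

# The six section titles retained by the regex filter and their row counts.
print(df["section_title"].value_counts())
# Every retained document keeps exactly n_section (= 6) of those sections.
print(df.groupby("document_id").size().describe())
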