Skip to content

Commit

Permalink
fix: convert .ipynb to .py
Browse files Browse the repository at this point in the history
  • Loading branch information
honghanhh committed Oct 28, 2024
1 parent 2ee1254 commit d71d3f2
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 1 deletion.
26 changes: 26 additions & 0 deletions lib/questions_eval/data_preprocessing/mimoracle_preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import pandas as pd
from datasets import load_dataset
from huggingface_hub import notebook_login

# Log in via huggingface_hub. This call sits at module level, so it executes
# as soon as the file is run or imported, before any function below is used.
# NOTE(review): notebook_login is presumably aimed at notebook environments —
# confirm it still behaves as intended now that this code lives in a .py file.
notebook_login()


def _preprocess(text: str) -> str:
    """Return the final line of ``text`` in lower case.

    Everything up to and including the last newline character is discarded.
    A string without any newline is lower-cased as a whole, and a string that
    ends with a newline yields an empty string.
    """
    _, _, last_line = text.rpartition("\n")
    return last_line.lower()


def _resample(df: pd.DataFrame, n_sample: int, n_section: int) -> pd.DataFrame:
    """Select the rows of documents that cover the targeted section titles.

    The ``section_title`` column is normalised with ``_preprocess`` and the
    rows are then narrowed down in three successive steps:

    1. keep rows whose normalised title matches one of the targeted sections;
    2. keep titles that occur strictly more than ``n_sample`` times;
    3. keep documents (``document_id``) that are left with exactly
       ``n_section`` rows.

    Args:
        df: Frame with at least ``section_title`` and ``document_id`` columns.
            It is not modified; the normalised titles live in the returned
            frame only.
        n_sample: A title must appear more than this many times to be kept.
        n_section: Exact number of remaining rows a document must have.

    Returns:
        A new DataFrame holding the surviving rows, with ``section_title``
        replaced by its normalised form.
    """
    # Joined from a tuple instead of a backslash-continued string literal so
    # that source indentation can never leak into the regular expression.
    patterns = "|".join(
        (
            "allergies",
            "history of present illness",
            "past medical history",
            "discharge medications",
            "social history",
            "medications on admission",
        )
    )
    # ``assign`` returns a new frame, so the caller's DataFrame keeps its
    # original titles instead of being overwritten in place.
    df = df.assign(section_title=[_preprocess(x) for x in df["section_title"]])
    # ``str.contains`` does a regex search, so a title only needs to contain
    # one of the alternatives rather than equal it.
    df = df[df.section_title.str.contains(patterns)]
    df = df.groupby("section_title").filter(lambda rows: len(rows) > n_sample)
    df = df.groupby("document_id").filter(lambda rows: len(rows) == n_section)
    return df


if __name__ == "__main__":
    # Load the train split of the dataset and convert it to a pandas frame.
    mimoracle = load_dataset("bio-datasets/mimoracle", split="train")
    # n_sample=2: a title must occur more than twice; n_section=6: a document
    # must end up with exactly six rows.
    subset = _resample(mimoracle.to_pandas(), n_sample=2, n_section=6)
    # Write the selection to the working directory without the index column.
    subset.to_csv("mimoracle_sample.csv", index=False)
2 changes: 1 addition & 1 deletion lib/questions_eval/run_mimoracle.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,7 @@ def main(cfg: DictConfig):
}
for key, value in log_dict.items():
wandb.run.summary[key] = value
wandb.log({"dataset/evaluation_mimoracle_gpt4o": wandb.Table(dataframe=df_joined)})
wandb.log({"dataset/evaluation_mimoracle_gpt4o_retest": wandb.Table(dataframe=df_joined)})
wandb.finish()


Expand Down

0 comments on commit d71d3f2

Please sign in to comment.