Skip to content

Commit

Permalink
refactor: fix data input as the name of the hydra params using only load_dataset hf function
Browse files Browse the repository at this point in the history
  • Loading branch information
honghanhh committed Oct 30, 2024
1 parent 31758e3 commit 8f1262b
Show file tree
Hide file tree
Showing 9 changed files with 6,797 additions and 549,177 deletions.
2 changes: 1 addition & 1 deletion lib/questions_eval/configs/run_mimoracle.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ defaults:

samples: 2
num_questions: 5
dataset: "/Users/hanh.tran/Desktop/open-nlp/lib/questions_eval/data_sample/mimoracle_train_sample.csv"
dataset: "mimoracle"

prompts:
summary: >-
Expand Down
549,168 changes: 0 additions & 549,168 deletions lib/questions_eval/data_sample/mimoracle_train_sample.csv

This file was deleted.

Empty file.
13 changes: 9 additions & 4 deletions lib/questions_eval/hf_datasets/mimoracle/_preprocessing.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,16 @@ def _resample(df: pd.DataFrame, n_sample: int, n_section: int) -> pd.DataFrame:
df = df[df.section_title.str.contains(patterns)]
df = df.groupby("section_title").filter(lambda x: len(x) > n_sample)
df = df.groupby("document_id").filter(lambda x: len(x) == n_section)
df.rename(columns={"section_content": "summary"}, inplace=True)
return df


if __name__ == "__main__":
df = load_dataset("bio-datasets/mimoracle", split="train").to_pandas()
sample_df = _resample(df, 2, 6)
def save_data_sampe(input_path: str, split: str, output_path: str) -> None:
    """Load one dataset split, resample it, and write the result to a CSV file.

    Args:
        input_path: Dataset identifier or path handed to ``load_dataset``.
        split: Name of the split to load (passed as ``split=``).
        output_path: Destination CSV path. Missing parent directories are
            created before writing.
    """
    # Function-scope import so the block does not depend on the module's
    # import list, which is not fully visible here.
    from pathlib import Path

    df = load_dataset(input_path, split=split).to_pandas()
    # Resampling parameters are fixed: n_sample=2, n_section=6.
    df = _resample(df, 2, 6)
    # ``to_csv`` refuses to write into a directory that does not exist, so
    # make sure the target folder (e.g. ``./data``) is there first.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(output_path, index=False)


sample_df.to_csv("mimoracle_sample.csv", index=False)
if __name__ == "__main__":
    # Export the resampled train and test splits, one CSV file per split.
    dataset_name = "bio-datasets/mimoracle"
    exports = (
        ("train", "./data/mimoracle_train.csv"),
        ("test", "./data/mimoracle_test.csv"),
    )
    for split_name, csv_path in exports:
        save_data_sampe(dataset_name, split_name, csv_path)
Loading

0 comments on commit 8f1262b

Please sign in to comment.