
Commit

feat: align mlebench data and evaluation & several fixes on kaggle workflow (#477)

* several improvements to the kaggle loop

* small refinements to the prompt

* fix bugs

* add the score of each model in every experiment

* fix CI error

* fix error in ventilator tpl

* fix CI

---------

Co-authored-by: Xu Yang <[email protected]>
Co-authored-by: Bowen Xian <[email protected]>
Co-authored-by: WinstonLiye <[email protected]>
Co-authored-by: TPLin22 <[email protected]>
5 people authored Nov 15, 2024
1 parent 2e38000 commit f6c522b
Showing 25 changed files with 365 additions and 210 deletions.
5 changes: 1 addition & 4 deletions rdagent/app/kaggle/loop.py
@@ -1,6 +1,4 @@
import subprocess
from collections import defaultdict
from concurrent.futures import TimeoutError
from typing import Any

import fire
@@ -14,7 +12,6 @@
Hypothesis2Experiment,
HypothesisExperiment2Feedback,
HypothesisGen,
Trace,
)
from rdagent.core.scenario import Scenario
from rdagent.core.utils import import_class
@@ -116,7 +113,7 @@ def running(self, prev_out: dict[str, Any]):

return exp

skip_loop_error = (ModelEmptyError, FactorEmptyError, TimeoutError)
skip_loop_error = (ModelEmptyError, FactorEmptyError)


def main(path=None, step_n=None, competition=None):
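For context, the `skip_loop_error` tuple is consumed by the workflow loop's error handling; with `TimeoutError` dropped from it, timeouts now propagate instead of silently skipping the iteration. A minimal sketch of how such a tuple is typically used (the wrapper function below is an illustration, not the actual loop implementation):

```python
from rdagent.core.exception import FactorEmptyError, ModelEmptyError

# After this commit, TimeoutError is no longer swallowed by the loop.
skip_loop_error = (ModelEmptyError, FactorEmptyError)

def run_step(step_fn, *args):
    """Illustrative wrapper: skip the current iteration only for known-benign errors."""
    try:
        return step_fn(*args)
    except skip_loop_error as e:
        print(f"Skipping this iteration: {e!r}")
        return None
    # Any other exception, including TimeoutError, propagates to the caller.
```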
5 changes: 4 additions & 1 deletion rdagent/components/coder/model_coder/CoSTEER/evaluators.py
@@ -283,7 +283,10 @@ def evaluate(
else:
gt_np_array = None

shape_feedback, shape_decision = shape_evaluator(gen_np_array, (batch_size, 1))
shape_feedback, shape_decision = shape_evaluator(
gen_np_array,
(batch_size, self.scen.model_output_channel if hasattr(self.scen, "model_output_channel") else 1),
)
value_feedback, value_decision = value_evaluator(gen_np_array, gt_np_array)
code_feedback, _ = ModelCodeEvaluator(scen=self.scen).evaluate(
target_task=target_task,
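The change above makes the expected output shape configurable per scenario: when the scenario defines `model_output_channel`, predictions are checked against `(batch_size, model_output_channel)`, otherwise the previous default of `(batch_size, 1)` is kept. A rough, self-contained sketch of that check, mirroring the `hasattr` fallback with `getattr` (the `shape_evaluator` body and the `_Scenario` stub below are illustrative assumptions, not the repository's implementation):

```python
import numpy as np

def shape_evaluator(prediction: np.ndarray, target_shape: tuple) -> tuple[str, bool]:
    """Compare a prediction array against the expected shape and report the result."""
    if prediction is None:
        return "No output was generated.", False
    if tuple(prediction.shape) == tuple(target_shape):
        return f"Output shape {prediction.shape} matches {target_shape}.", True
    return f"Output shape {prediction.shape} does not match expected {target_shape}.", False

class _Scenario:
    model_output_channel = 3  # e.g. a competition with multiple output columns

batch_size, scen = 32, _Scenario()
gen_np_array = np.zeros((batch_size, 3))
# Fall back to a single output column when the scenario does not define the attribute.
expected_shape = (batch_size, getattr(scen, "model_output_channel", 1))
feedback, decision = shape_evaluator(gen_np_array, expected_shape)
print(feedback, decision)
```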
1 change: 1 addition & 0 deletions rdagent/components/knowledge_management/graph.py
@@ -22,6 +22,7 @@ class UndirectedNode(Node):
def __init__(self, content: str = "", label: str = "", embedding: Any = None) -> None:
super().__init__(content, label, embedding)
self.neighbors: set[UndirectedNode] = set()
assert isinstance(content, str), "content must be a string"

def add_neighbor(self, node: UndirectedNode) -> None:
self.neighbors.add(node)
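The new assertion guards against non-string `content` (for example a dict) entering the knowledge graph, which would break embedding later. A minimal usage sketch of `UndirectedNode`, keeping only what the diff shows and a simplified stand-in for the base class; the node contents are example values:

```python
from typing import Any

class Node:  # simplified stand-in for the real base class
    def __init__(self, content: str = "", label: str = "", embedding: Any = None) -> None:
        self.content, self.label, self.embedding = content, label, embedding

class UndirectedNode(Node):
    def __init__(self, content: str = "", label: str = "", embedding: Any = None) -> None:
        super().__init__(content, label, embedding)
        self.neighbors: set["UndirectedNode"] = set()
        assert isinstance(content, str), "content must be a string"

    def add_neighbor(self, node: "UndirectedNode") -> None:
        self.neighbors.add(node)

competition = UndirectedNode(content="ventilator-pressure-prediction", label="competition")
hypothesis = UndirectedNode(content="Adding lag features improves the score.", label="hypothesis")
competition.add_neighbor(hypothesis)
```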
2 changes: 1 addition & 1 deletion rdagent/components/proposal/__init__.py
@@ -72,7 +72,7 @@ def __init__(self, scen: Scenario):
self.targets = "model tuning"


class FactorAndModelHypothesisGen(FactorHypothesisGen):
class FactorAndModelHypothesisGen(LLMHypothesisGen):
def __init__(self, scen: Scenario):
super().__init__(scen)
self.targets = "feature engineering and model building"
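This change only swaps the parent class: `FactorAndModelHypothesisGen` now inherits directly from `LLMHypothesisGen` rather than from `FactorHypothesisGen`, so it no longer picks up the factor-specific `targets` before overriding it. A schematic of the resulting hierarchy; class bodies are reduced to the `targets` assignment, and the `LLMHypothesisGen` internals and the `"factors"` / `"hypothesis"` strings are assumptions for illustration:

```python
class Scenario: ...  # placeholder for rdagent.core.scenario.Scenario

class LLMHypothesisGen:  # assumed shared base holding the LLM-driven generation logic
    def __init__(self, scen: Scenario):
        self.scen = scen
        self.targets = "hypothesis"  # illustrative default

class FactorHypothesisGen(LLMHypothesisGen):
    def __init__(self, scen: Scenario):
        super().__init__(scen)
        self.targets = "factors"  # illustrative factor-specific target

# Before this commit: class FactorAndModelHypothesisGen(FactorHypothesisGen)
class FactorAndModelHypothesisGen(LLMHypothesisGen):
    def __init__(self, scen: Scenario):
        super().__init__(scen)
        self.targets = "feature engineering and model building"
```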
7 changes: 0 additions & 7 deletions rdagent/components/runner/__init__.py
@@ -21,12 +21,5 @@ def get_cache_key(self, exp: Experiment) -> str:
def assign_cached_result(self, exp: Experiment, cached_res: Experiment) -> Experiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1].result = cached_res.based_experiments[-1].result
if cached_res.experiment_workspace.workspace_path.exists():
for csv_file in cached_res.experiment_workspace.workspace_path.glob("*.csv"):
shutil.copy(csv_file, exp.experiment_workspace.workspace_path)
for py_file in (cached_res.experiment_workspace.workspace_path / "feature").glob("*.py"):
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "feature")
for py_file in (cached_res.experiment_workspace.workspace_path / "model").glob("*.py"):
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "model")
exp.result = cached_res.result
return exp
1 change: 1 addition & 0 deletions rdagent/core/experiment.py
@@ -210,6 +210,7 @@ def __init__(
self.sub_workspace_list: list[ASpecificWSForSubTasks | None] = [None] * len(self.sub_tasks)
self.based_experiments: Sequence[ASpecificWSForExperiment] = based_experiments
self.result: object = None # The result of the experiment, can be different types in different scenarios.
self.sub_results: dict[str, float] = {}
self.experiment_workspace: ASpecificWSForExperiment | None = None


2 changes: 1 addition & 1 deletion rdagent/core/utils.py
@@ -142,7 +142,7 @@ def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) ->
list
"""
if n == 1:
if n == 1 or max(1, min(n, len(func_calls))) == 1:
return [f(*args) for f, args in func_calls]

with mp.Pool(processes=max(1, min(n, len(func_calls)))) as pool:
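For reference, a self-contained sketch of what `multiprocessing_wrapper` does after this change: it falls back to sequential execution whenever only one effective worker is available, and otherwise dispatches the calls through a process pool. The signature is taken from the diff; the pool-dispatch body is a simplified reconstruction, not the exact repository code:

```python
import multiprocessing as mp
from typing import Callable

def multiprocessing_wrapper(func_calls: list[tuple[Callable, tuple]], n: int) -> list:
    """Run (func, args) pairs, sequentially if only one effective worker is available."""
    if n == 1 or max(1, min(n, len(func_calls))) == 1:
        return [f(*args) for f, args in func_calls]
    with mp.Pool(processes=max(1, min(n, len(func_calls)))) as pool:
        results = [pool.apply_async(f, args) for f, args in func_calls]
        return [r.get() for r in results]

if __name__ == "__main__":
    print(multiprocessing_wrapper([(pow, (2, 10)), (pow, (3, 3))], n=1))  # [1024, 27]
```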
4 changes: 3 additions & 1 deletion rdagent/log/ui/app.py
@@ -409,7 +409,7 @@ def summary_window():
if state.alpha158_metrics is not None:
selected = ["alpha158"] + [i for i in df.index if state.h_decisions[int(i[6:])]]
else:
selected = [i for i in df.index if state.h_decisions[int(i[6:])]]
selected = [i for i in df.index if i == "Baseline" or state.h_decisions[int(i[6:])]]
df = df.loc[selected]
if df.shape[0] == 1:
st.table(df.iloc[0])
@@ -637,6 +637,7 @@ def evolving_window():
for j, w in enumerate(ws):
with wtabs[j]:
# Evolving Code
st.markdown(f"**Workspace Path**: {w.workspace_path}")
for k, v in w.code_dict.items():
with st.expander(f":green[`{k}`]", expanded=True):
st.code(v, language="python")
@@ -681,6 +682,7 @@ def evolving_window():
st.text_input("log path", key="log_path", on_change=refresh, label_visibility="collapsed")
else:
folders = [folder.relative_to(main_log_path) for folder in main_log_path.iterdir() if folder.is_dir()]
folders = sorted(folders, key=lambda x: x.name)
st.selectbox(f"**Select from `{main_log_path}`**", folders, key="log_path", on_change=refresh)
else:
st.text_input(":blue[**log path**]", key="log_path", on_change=refresh)
107 changes: 61 additions & 46 deletions rdagent/scenarios/kaggle/developer/feedback.py
@@ -4,6 +4,7 @@
import pandas as pd
from jinja2 import Environment, StrictUndefined

from rdagent.components.knowledge_management.graph import UndirectedNode
from rdagent.core.experiment import Experiment
from rdagent.core.prompts import Prompts
from rdagent.core.proposal import (
@@ -14,6 +15,7 @@
)
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
from rdagent.scenarios.kaggle.experiment.kaggle_experiment import KG_SELECT_MAPPING
from rdagent.utils import convert2bool

prompt_dict = Prompts(file_path=Path(__file__).parent.parent / "prompts.yaml")
@@ -59,17 +61,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
Any: The feedback generated for the given experiment and hypothesis.
"""
logger.info("Generating feedback...")
hypothesis_text = hypothesis.hypothesis
current_result = exp.result
tasks_factors = []
if exp.sub_tasks:
tasks_factors = []
for task in exp.sub_tasks:
try:
task_info = task.get_task_information_and_implementation_result()
tasks_factors.append(task_info)
except AttributeError:
print(f"Warning: Task {task} does not have get_task_information_and_implementation_result method")

evaluation_description = None
# Check if there are any based experiments
@@ -84,11 +76,6 @@
) # Compare with itself
print("Warning: No previous experiments to compare against. Using current result as baseline.")

available_features = {
task_info: feature_shape for task_info, feature_shape in exp.experiment_workspace.data_description
}
model_code = exp.experiment_workspace.model_description

# Generate the user prompt based on the action type
if hypothesis.action == "Model tuning":
prompt_key = "model_tuning_feedback_generation"
@@ -104,35 +91,56 @@
.render(scenario=self.scen.get_scenario_all_desc(filtered_tag="feedback"))
)

last_task_and_code = None
if trace.hist:
last_task_and_code = (
trace.hist[-1][1].experiment_workspace.data_description
if trace.hist[-1][0].action == "Feature engineering" or trace.hist[-1][0].action == "Feature processing"
else trace.hist[-1][1].experiment_workspace.model_description
)
sota_exp = exp.based_experiments[-1] if exp.based_experiments else None
assert sota_exp is not None
sota_features = str(exp.based_experiments[-1].experiment_workspace.data_description)
sota_models = json.dumps(exp.based_experiments[-1].experiment_workspace.model_description, indent=2)
sota_result = exp.based_experiments[-1].result
sota_sub_results = exp.based_experiments[-1].sub_results

current_hypothesis = hypothesis.hypothesis
current_hypothesis_reason = hypothesis.reason
current_target_action = hypothesis.action
current_sub_exps_to_code = {}
if hypothesis.action == "Model tuning":
current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.sub_workspace_list[0].code
elif hypothesis.action == "Model feature selection":
current_sub_exps_to_code[exp.sub_tasks[0].get_task_information()] = exp.experiment_workspace.code_dict[
KG_SELECT_MAPPING[exp.sub_tasks[0].model_type]
]
else:
current_sub_exps_to_code = {
sub_ws.target_task.get_task_information(): sub_ws.code for sub_ws in exp.sub_workspace_list
}
current_sub_exps_to_code_str = json.dumps(current_sub_exps_to_code, indent=2)
current_result = exp.result
current_sub_results = exp.sub_results

last_hypothesis_and_feedback = None
if trace.hist and len(trace.hist) > 0:
last_hypothesis_and_feedback = (trace.hist[-1][0], trace.hist[-1][2])

# Prepare render dictionary
render_dict = {
"last_hypothesis": trace.hist[-1][0] if trace.hist else None,
"last_task_and_code": last_task_and_code,
"last_result": trace.hist[-1][1].result if trace.hist else None,
"sota_task_and_code": (
exp.based_experiments[-1].experiment_workspace.data_description if exp.based_experiments else None
),
"sota_result": exp.based_experiments[-1].result if exp.based_experiments else None,
"hypothesis": hypothesis,
"exp": exp,
"model_code": model_code, # This turn
"available_features": available_features, # This turn
"combined_result": combined_result, # This turn and sota
"hypothesis_text": hypothesis_text, # This turn
"task_details": tasks_factors, # This turn
"sota_features": sota_features,
"sota_models": sota_models,
"sota_result": sota_result,
"sota_sub_results": sota_sub_results,
"current_hypothesis": current_hypothesis,
"current_hypothesis_reason": current_hypothesis_reason,
"current_target_action": current_target_action,
"current_sub_exps_to_code": current_sub_exps_to_code_str,
"current_result": current_result,
"current_sub_results": current_sub_results,
"combined_result": combined_result,
"evaluation_description": evaluation_description,
"last_hypothesis_and_feedback": last_hypothesis_and_feedback,
}

usr_prompt = (
Environment(undefined=StrictUndefined).from_string(prompt_dict[prompt_key]["user"]).render(**render_dict)
Environment(undefined=StrictUndefined)
.from_string(prompt_dict["kg_feedback_generation_user"])
.render(**render_dict)
)

response = APIBackend().build_messages_and_create_chat_completion(
@@ -160,22 +168,29 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
percentile_ranking = (insert_position) / (len(sorted_scores)) * 100

experiment_feedback = {
"current_competition": self.scen.get_competition_full_desc(),
"hypothesis_text": hypothesis_text,
"hypothesis_text": current_hypothesis,
"tasks_factors": current_sub_exps_to_code,
"current_result": current_result,
"model_code": model_code,
"available_features": available_features,
"observations": observations,
"hypothesis_evaluation": hypothesis_evaluation,
"reason": reason,
"percentile_ranking": percentile_ranking,
}

if self.scen.if_using_vector_rag:
raise NotImplementedError("Vector RAG is not implemented yet since there are plenty bugs!")
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
self.scen.vector_base.dump()
elif self.scen.if_using_graph_rag:
trace.knowledge_base.add_document(experiment_feedback, self.scen)
competition_node = UndirectedNode(content=self.scen.get_competition_full_desc(), label="competition")
hypothesis_node = UndirectedNode(content=hypothesis.hypothesis, label=hypothesis.action)
exp_code_nodes = []
for exp, code in current_sub_exps_to_code.items():
exp_code_nodes.append(UndirectedNode(content=exp, label="experiments"))
if code != "":
exp_code_nodes.append(UndirectedNode(content=code, label="code"))
conclusion_node = UndirectedNode(content=response, label="conclusion")
all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node]
all_nodes = trace.knowledge_base.batch_embedding(all_nodes)
for node in all_nodes:
if node is not competition_node:
trace.knowledge_base.add_node(node, competition_node)

if self.scen.if_action_choosing_based_on_UCB:
self.scen.action_counts[hypothesis.action] += 1
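The graph-RAG branch above replaces the old `add_document` call with explicit node construction: one node for the competition, one for the hypothesis, one per sub-experiment (plus its code when present), and one for the LLM conclusion, all embedded in a batch and then connected to the competition node. A condensed sketch of that wiring, assuming `knowledge_base` exposes `batch_embedding(nodes)` and `add_node(node, neighbor)` as used in the diff; the node/graph classes and contents below are minimal stand-ins, not the repository's implementation:

```python
from typing import Any

class UndirectedNode:  # minimal stand-in for rdagent's knowledge-graph node
    def __init__(self, content: str, label: str, embedding: Any = None) -> None:
        self.content, self.label, self.embedding = content, label, embedding
        self.neighbors: set["UndirectedNode"] = set()

class UndirectedGraph:  # minimal stand-in for the trace.knowledge_base object
    def batch_embedding(self, nodes: list[UndirectedNode]) -> list[UndirectedNode]:
        for n in nodes:
            n.embedding = [float(len(n.content))]  # placeholder embedding
        return nodes

    def add_node(self, node: UndirectedNode, neighbor: UndirectedNode) -> None:
        node.neighbors.add(neighbor)
        neighbor.neighbors.add(node)

knowledge_base = UndirectedGraph()
competition_node = UndirectedNode("ventilator-pressure-prediction: predict ...", "competition")
hypothesis_node = UndirectedNode("Tune the LSTM hidden size.", "Model tuning")
exp_code_nodes = [
    UndirectedNode("Task: tune model hyperparameters", "experiments"),
    UndirectedNode("class LSTMModel(nn.Module): ...", "code"),
]
conclusion_node = UndirectedNode("The hypothesis improved the validation score.", "conclusion")

all_nodes = [competition_node, hypothesis_node, *exp_code_nodes, conclusion_node]
all_nodes = knowledge_base.batch_embedding(all_nodes)
for node in all_nodes:
    if node is not competition_node:
        knowledge_base.add_node(node, competition_node)  # star topology around the competition
```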
34 changes: 32 additions & 2 deletions rdagent/scenarios/kaggle/developer/runner.py
@@ -3,9 +3,11 @@
import shutil
from pathlib import Path

import pandas as pd

from rdagent.components.runner import CachedRunner
from rdagent.core.exception import CoderError, FactorEmptyError, ModelEmptyError
from rdagent.core.experiment import ASpecificExp
from rdagent.core.experiment import ASpecificExp, Experiment
from rdagent.core.prompts import Prompts
from rdagent.core.utils import cache_with_pickle
from rdagent.oai.llm_utils import md5_hash
@@ -28,6 +30,18 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
cached_key_from_exp = CachedRunner.get_cache_key(self, exp)
return md5_hash(codes + cached_key_from_exp)

def assign_cached_result(self, exp: Experiment, cached_res: Experiment) -> Experiment:
exp = CachedRunner.assign_cached_result(self, exp, cached_res)
if cached_res.experiment_workspace.workspace_path.exists():
for csv_file in cached_res.experiment_workspace.workspace_path.glob("*.csv"):
shutil.copy(csv_file, exp.experiment_workspace.workspace_path)
for py_file in (cached_res.experiment_workspace.workspace_path / "feature").glob("*.py"):
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "feature")
for py_file in (cached_res.experiment_workspace.workspace_path / "model").glob("*.py"):
shutil.copy(py_file, exp.experiment_workspace.workspace_path / "model")
exp.experiment_workspace.data_description = cached_res.experiment_workspace.data_description
return exp

@cache_with_pickle(get_cache_key, CachedRunner.assign_cached_result)
def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
"""
@@ -40,6 +54,11 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE

exp.result = result

sub_result_score_path = Path(exp.experiment_workspace.workspace_path) / "sub_submission_score.csv"
if sub_result_score_path.exists():
sub_submission_df = pd.read_csv(sub_result_score_path)
exp.sub_results = sub_submission_df.set_index("Model")["score"].to_dict()

return exp


@@ -67,6 +86,10 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
raise CoderError("No result is returned from the experiment workspace")

exp.result = result
sub_result_score_path = Path(exp.experiment_workspace.workspace_path) / "sub_submission_score.csv"
if sub_result_score_path.exists():
sub_submission_df = pd.read_csv(sub_result_score_path)
exp.sub_results = sub_submission_df.set_index("Model")["score"].to_dict()

return exp

@@ -79,10 +102,13 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
for sub_ws in exp.sub_workspace_list:
if sub_ws.code_dict == {}:
continue
execued_df = sub_ws.execute()[1]
if execued_df is None:
continue
implemented_factor_count += 1
target_feature_file_name = f"feature/feature_{current_feature_file_count:05d}.py"
exp.experiment_workspace.inject_code(**{target_feature_file_name: sub_ws.code_dict["factor.py"]})
feature_shape = sub_ws.execute()[1].shape[-1]
feature_shape = execued_df.shape[-1]
exp.experiment_workspace.data_description.append((sub_ws.target_task.get_task_information(), feature_shape))
current_feature_file_count += 1
if implemented_factor_count == 0:
@@ -100,5 +126,9 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
raise CoderError("No result is returned from the experiment workspace")

exp.result = result
sub_result_score_path = Path(exp.experiment_workspace.workspace_path) / "sub_submission_score.csv"
if sub_result_score_path.exists():
sub_submission_df = pd.read_csv(sub_result_score_path)
exp.sub_results = sub_submission_df.set_index("Model")["score"].to_dict()

return exp
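The same post-processing block appears after each `develop`/`init_develop` call above: if the workspace wrote a `sub_submission_score.csv`, its per-model scores are loaded into `exp.sub_results`. A small standalone illustration of that step, factored into a helper for clarity (the helper name is hypothetical; the CSV columns `Model` and `score` are taken from the diff):

```python
from pathlib import Path

import pandas as pd

def load_sub_results(workspace_path: Path) -> dict[str, float]:
    """Read per-model scores written by the Kaggle workspace, if any."""
    sub_result_score_path = workspace_path / "sub_submission_score.csv"
    if not sub_result_score_path.exists():
        return {}
    sub_submission_df = pd.read_csv(sub_result_score_path)
    # One row per trained model, with columns: Model, score
    return sub_submission_df.set_index("Model")["score"].to_dict()

# Usage inside the runner (sketch):
# exp.sub_results = load_sub_results(Path(exp.experiment_workspace.workspace_path))
```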
@@ -17,13 +17,14 @@ RUN python -m pip install numpy
RUN python -m pip install pandas
# RUN pip install pyg_lib torch_scatter torch_sparse torch_cluster -f https://data.pyg.org/whl/torch-2.3.0%2Bcu121.html
RUN pip install torch_geometric
RUN pip install pytorch_lightning
RUN pip install ogb
RUN pip install networkx
RUN pip install scikit-learn
RUN pip install catboost
RUN pip install xgboost
RUN pip install sparse
RUN pip install lightgbm
RUN pip install lightgbm==3.3.5
RUN pip install pyarrow
RUN pip install fastparquet
RUN pip install optuna
17 changes: 17 additions & 0 deletions rdagent/scenarios/kaggle/docker/mle_bench_docker/Dockerfile
@@ -0,0 +1,17 @@
FROM pytorch/pytorch:2.4.0-cuda12.4-cudnn9-runtime
# For GPU support, please choose the proper tag from https://hub.docker.com/r/pytorch/pytorch/tags

RUN apt-get clean && apt-get update && apt-get install -y \
curl \
vim \
git \
build-essential \
git-lfs \
&& rm -rf /var/lib/apt/lists/*

RUN git clone https://github.com/openai/mle-bench.git
RUN cd mle-bench && git lfs fetch --all
RUN cd mle-bench && git lfs pull
RUN cd mle-bench && python -m pip install -e .

WORKDIR /workspace