feat(kaggle): several updates in kaggle scenarios (#476)
* update plot

* log and reduce token usage

* trace tag

* add simple_background parameter to get_scenario_all_desc

* update trace

* update first version code

* chat model map

* add annotation for stack index

* add annotation

* reformatted by black

* several updates on kaggle scenarios

* update some new changes

* fix CI

* fix CI

* fix a bug

* fix bugs in graph RAG

---------

Co-authored-by: Tim <[email protected]>
peteryang1 and qew21 authored Nov 6, 2024
1 parent 2fc0d77 commit 245d211
Showing 49 changed files with 515 additions and 380 deletions.
39 changes: 22 additions & 17 deletions rdagent/app/benchmark/factor/analysis.py
@@ -82,12 +82,12 @@ def result_all_key_order(self, x):
         for i in x:
             order_v.append(
                 {
-                    "avg. Run successful rate": 0,
-                    "avg. Format successful rate": 1,
-                    "avg. Correlation (value only)": 2,
-                    "max. Correlation": 3,
-                    "max. accuracy": 4,
-                    "avg. accuracy": 5,
+                    "Avg Run SR": 0,
+                    "Avg Format SR": 1,
+                    "Avg Correlation": 2,
+                    "Max Correlation": 3,
+                    "Max Accuracy": 4,
+                    "Avg Accuracy": 5,
                 }.get(i, i),
             )
         return order_v
@@ -140,12 +140,12 @@ def analyze_data(self, sum_df):

         result_all = pd.concat(
             {
-                "avg. Correlation (value only)": corr_res.iloc[:, 0],
-                "avg. Format successful rate": format_succ_rate_f.iloc[:, 0],
-                "avg. Run successful rate": succ_rate_f.iloc[:, 0],
-                "max. Correlation": corr_max_res.iloc[:, 0],
-                "max. accuracy": value_max_res.iloc[:, 0],
-                "avg. accuracy": value_avg_res.iloc[:, 0],
+                "Avg Correlation": corr_res.iloc[:, 0],
+                "Avg Format SR": format_succ_rate_f.iloc[:, 0],
+                "Avg Run SR": succ_rate_f.iloc[:, 0],
+                "Max Correlation": corr_max_res.iloc[:, 0],
+                "Max Accuracy": value_max_res.iloc[:, 0],
+                "Avg Accuracy": value_avg_res.iloc[:, 0],
             },
             axis=1,
         )
@@ -179,11 +179,16 @@ def change_fs(font_size):

     @staticmethod
     def plot_data(data, file_name, title):
-        plt.figure(figsize=(10, 6))
-        sns.barplot(x="index", y="b", hue="a", data=data)
-        plt.xlabel("Method")
+        plt.figure(figsize=(10, 10))
         plt.ylabel("Value")
-        plt.title(title)
+        colors = ["#3274A1", "#E1812C", "#3A923A", "#C03D3E"]
+        plt.bar(data["a"], data["b"], color=colors, capsize=5)
+        for idx, row in data.iterrows():
+            plt.text(idx, row["b"] + 0.01, f"{row['b']:.2f}", ha="center", va="bottom")
+        plt.suptitle(title, y=0.98)
+        plt.xticks(rotation=45)
+        plt.ylim(0, 1)
+        plt.tight_layout()
         plt.savefig(file_name)


@@ -201,7 +206,7 @@ def main(
     final_results_df = pd.DataFrame(final_results)

     Plotter.change_fs(20)
-    plot_data = final_results_df.drop(["max. accuracy", "avg. accuracy"], axis=0).T
+    plot_data = final_results_df.drop(["Max Accuracy", "Avg Accuracy"], axis=0).T
     plot_data = plot_data.reset_index().melt("index", var_name="a", value_name="b")
     Plotter.plot_data(plot_data, "./comparison_plot.png", title)

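The rewritten `plot_data` drops the grouped seaborn barplot in favor of plain matplotlib bars with a per-bar value label and a fixed [0, 1] y-range (every remaining metric is a rate or a correlation). A self-contained sketch of the same pattern on made-up data; the method names and scores below are illustrative, not benchmark results:

```python
import matplotlib.pyplot as plt
import pandas as pd

# Toy stand-in for the melted benchmark table ("a" = method, "b" = score).
data = pd.DataFrame({"a": ["baseline", "RD-Agent"], "b": [0.42, 0.67]})

plt.figure(figsize=(10, 10))
plt.ylabel("Value")
colors = ["#3274A1", "#E1812C", "#3A923A", "#C03D3E"]
plt.bar(data["a"], data["b"], color=colors[: len(data)])
for idx, row in data.iterrows():
    # Label each bar with its value, just above the bar top.
    plt.text(idx, row["b"] + 0.01, f"{row['b']:.2f}", ha="center", va="bottom")
plt.suptitle("Benchmark comparison", y=0.98)
plt.xticks(rotation=45)
plt.ylim(0, 1)  # all plotted metrics live in [0, 1]
plt.tight_layout()
plt.savefig("./comparison_plot.png")
```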
3 changes: 2 additions & 1 deletion rdagent/app/kaggle/loop.py
@@ -1,5 +1,6 @@
 import subprocess
 from collections import defaultdict
+from concurrent.futures import TimeoutError
 from typing import Any

 import fire
@@ -115,7 +116,7 @@ def running(self, prev_out: dict[str, Any]):

         return exp

-    skip_loop_error = (ModelEmptyError, FactorEmptyError)
+    skip_loop_error = (ModelEmptyError, FactorEmptyError, TimeoutError)


 def main(path=None, step_n=None, competition=None):
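`skip_loop_error` looks to be the tuple of exception types the R&D loop treats as skippable, so adding `concurrent.futures.TimeoutError` lets a timed-out Kaggle step skip the current iteration instead of killing the whole run. A hedged sketch of that pattern; the runner below is illustrative, only the `skip_loop_error` tuple comes from this diff:

```python
from concurrent.futures import TimeoutError


class ModelEmptyError(Exception): ...  # stand-ins for rdagent's real exceptions
class FactorEmptyError(Exception): ...


skip_loop_error = (ModelEmptyError, FactorEmptyError, TimeoutError)


def run_loop(steps):
    for i, step in enumerate(steps):
        try:
            step()
        except skip_loop_error as e:
            # A timeout (or empty model/factor) skips this iteration only.
            print(f"step {i} skipped: {e!r}")
```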
13 changes: 9 additions & 4 deletions rdagent/components/coder/factor_coder/CoSTEER/evaluators.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from jinja2 import Environment, StrictUndefined

+from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS
 from rdagent.components.coder.factor_coder.CoSTEER.evolvable_subjects import (
     FactorEvolvingItem,
 )
@@ -92,7 +93,11 @@ def evaluate(
             .from_string(evaluate_prompts["evaluator_code_feedback_v1_system"])
             .render(
                 scenario=(
-                    self.scen.get_scenario_all_desc(target_task)
+                    self.scen.get_scenario_all_desc(
+                        target_task,
+                        filtered_tag="feature",
+                        simple_background=FACTOR_IMPLEMENT_SETTINGS.simple_background,
+                    )
                     if self.scen is not None
                     else "No scenario description."
                 )
@@ -190,15 +195,15 @@ def evaluate(
             )
         buffer = io.StringIO()
         gen_df.info(buf=buffer)
-        gen_df_info_str = f"The use is currently working on a feature related task.\nThe output dataframe info is:\n{buffer.getvalue()}"
+        gen_df_info_str = f"The user is currently working on a feature related task.\nThe output dataframe info is:\n{buffer.getvalue()}"
         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(
                 evaluate_prompts["evaluator_output_format_system"],
             )
             .render(
                 scenario=(
-                    self.scen.get_scenario_all_desc(implementation.target_task)
+                    self.scen.get_scenario_all_desc(implementation.target_task, filtered_tag="feature")
                     if self.scen is not None
                     else "No scenario description."
                 )
@@ -512,7 +517,7 @@ def evaluate(
             .from_string(evaluate_prompts["evaluator_final_decision_v1_system"])
             .render(
                 scenario=(
-                    self.scen.get_scenario_all_desc(target_task)
+                    self.scen.get_scenario_all_desc(target_task, filtered_tag="feature")
                     if self.scen is not None
                     else "No scenario description."
                 )
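All three evaluator prompts now pass `filtered_tag="feature"` to `get_scenario_all_desc`, and the code-feedback prompt additionally forwards the new `simple_background` setting, so the scenario description is trimmed to the sections relevant to the current task, cutting prompt tokens. A sketch of what such a method might look like; the attribute names and filtering internals are assumptions, only the parameters appear in this diff:

```python
class Scenario:
    def get_scenario_all_desc(self, task=None, filtered_tag=None, simple_background=False) -> str:
        """Compose the scenario description that gets rendered into an LLM prompt."""
        if simple_background:
            # Cheap short-form background for high-volume feedback calls.
            return self.background_brief  # hypothetical attribute
        parts = [self.background]  # hypothetical attribute
        # Keep only sections tagged for this task ("feature", a model type, ...).
        for tag, desc in self.tagged_sections.items():  # hypothetical attribute
            if filtered_tag is None or tag == filtered_tag:
                parts.append(desc)
        return "\n".join(parts)
```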
@@ -234,7 +234,7 @@ def implement_one_factor(
                 implement_prompts["evolving_strategy_factor_implementation_v1_system"],
             )
             .render(
-                scenario=self.scen.get_scenario_all_desc(target_task),
+                scenario=self.scen.get_scenario_all_desc(target_task, filtered_tag="feature"),
                 queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
             )
         )
3 changes: 3 additions & 0 deletions rdagent/components/coder/factor_coder/config.py
@@ -35,6 +35,9 @@ class Config:
     v2_error_summary: bool = False
     v2_knowledge_sampler: float = 1.0

+    simple_background: bool = False
+    """Whether to use simple background information for code feedback"""
+
     file_based_execution_timeout: int = 120
     """Timeout in seconds for each factor implementation execution"""

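`simple_background` defaults to `False`, so nothing changes unless a user opts in. Assuming the settings object can be mutated at runtime (how this `Config` is bound to environment variables is not shown in the hunk), enabling it could look like:

```python
from rdagent.components.coder.factor_coder.config import FACTOR_IMPLEMENT_SETTINGS

# Trade scenario detail for fewer prompt tokens in code-feedback calls.
FACTOR_IMPLEMENT_SETTINGS.simple_background = True
```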
4 changes: 2 additions & 2 deletions rdagent/components/coder/model_coder/CoSTEER/evaluators.py
@@ -83,7 +83,7 @@ def evaluate(
             .from_string(evaluate_prompts["evaluator_code_feedback"]["system"])
             .render(
                 scenario=(
-                    self.scen.get_scenario_all_desc(target_task)
+                    self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type)
                     if self.scen is not None
                     else "No scenario description."
                 )
@@ -145,7 +145,7 @@ def evaluate(
             .from_string(evaluate_prompts["evaluator_final_feedback"]["system"])
             .render(
                 scenario=(
-                    self.scen.get_scenario_all_desc(target_task)
+                    self.scen.get_scenario_all_desc(target_task, filtered_tag=target_task.model_type)
                     if self.scen is not None
                     else "No scenario description."
                 )
@@ -76,7 +76,7 @@ def implement_one_model(
                 coder_prompts["evolving_strategy_model_coder"]["system"],
             )
             .render(
-                scenario=self.scen.get_scenario_all_desc(),
+                scenario=self.scen.get_scenario_all_desc(filtered_tag=target_task.model_type),
                 queried_former_failed_knowledge=queried_former_failed_knowledge_to_render,
                 current_code=current_code,
             )
@@ -13,9 +13,11 @@ valid_y = pd.Series(np.random.randint(0, 2, 8))

 model = fit(train_X, train_y, valid_X, valid_y)
 execution_model_output = predict(model, valid_X)
+
 if isinstance(execution_model_output, torch.Tensor):
     execution_model_output = execution_model_output.cpu().detach().numpy()

+
 execution_feedback_str = f"Execution successful, output numpy ndarray shape: {execution_model_output.shape}"

 pickle.dump(execution_model_output, open("execution_model_output.pkl", "wb"))
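The execution template guards against model code that returns a `torch.Tensor` instead of a NumPy array: `.cpu()` moves the tensor off the GPU, `.detach()` drops it from the autograd graph, and `.numpy()` converts it. A minimal standalone illustration of the same guard:

```python
import numpy as np
import torch


def to_numpy(output):
    # Accept either a torch.Tensor or an ndarray from user-written predict().
    if isinstance(output, torch.Tensor):
        output = output.cpu().detach().numpy()
    return output


print(to_numpy(torch.ones(8, 1)).shape)  # (8, 1)
print(to_numpy(np.zeros((8, 1))).shape)  # (8, 1)
```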
@@ -4,7 +4,7 @@

 from jinja2 import Environment, StrictUndefined

-from rdagent.components.coder.factor_coder.factor import FactorExperiment
+from rdagent.core.experiment import Experiment
 from rdagent.core.prompts import Prompts
 from rdagent.core.proposal import (
     Hypothesis,
@@ -18,10 +18,8 @@
 prompt_dict = Prompts(file_path=Path(__file__).parent / "prompts.yaml")


-FactorHypothesis = Hypothesis
-
-
-class FactorHypothesisGen(HypothesisGen):
+class LLMHypothesisGen(HypothesisGen):
     def __init__(self, scen: Scenario):
         super().__init__(scen)
@@ -30,17 +28,17 @@ def __init__(self, scen: Scenario):
     def prepare_context(self, trace: Trace) -> Tuple[dict, bool]: ...

     @abstractmethod
-    def convert_response(self, response: str) -> FactorHypothesis: ...
+    def convert_response(self, response: str) -> Hypothesis: ...

-    def gen(self, trace: Trace) -> FactorHypothesis:
+    def gen(self, trace: Trace) -> Hypothesis:
         context_dict, json_flag = self.prepare_context(trace)

         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["hypothesis_gen"]["system_prompt"])
             .render(
-                targets="factors",
-                scenario=self.scen.get_scenario_all_desc(),
+                targets=self.targets,
+                scenario=self.scen.get_scenario_all_desc(filtered_tag="hypothesis_and_experiment"),
                 hypothesis_output_format=context_dict["hypothesis_output_format"],
                 hypothesis_specification=context_dict["hypothesis_specification"],
             )
@@ -49,7 +47,7 @@ def gen(self, trace: Trace) -> FactorHypothesis:
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["hypothesis_gen"]["user_prompt"])
             .render(
-                targets="factors",
+                targets=self.targets,
                 hypothesis_and_feedback=context_dict["hypothesis_and_feedback"],
                 RAG=context_dict["RAG"],
             )
@@ -62,29 +60,47 @@ def gen(self, trace: Trace) -> FactorHypothesis:
         return hypothesis


-class FactorHypothesis2Experiment(Hypothesis2Experiment[FactorExperiment]):
+class FactorHypothesisGen(LLMHypothesisGen):
+    def __init__(self, scen: Scenario):
+        super().__init__(scen)
+        self.targets = "factors"
+
+
+class ModelHypothesisGen(LLMHypothesisGen):
+    def __init__(self, scen: Scenario):
+        super().__init__(scen)
+        self.targets = "model tuning"
+
+
+class FactorAndModelHypothesisGen(FactorHypothesisGen):
+    def __init__(self, scen: Scenario):
+        super().__init__(scen)
+        self.targets = "feature engineering and model building"
+
+
+class LLMHypothesis2Experiment(Hypothesis2Experiment[Experiment]):
     @abstractmethod
     def prepare_context(self, hypothesis: Hypothesis, trace: Trace) -> Tuple[dict, bool]: ...

     @abstractmethod
-    def convert_response(self, response: str, trace: Trace) -> FactorExperiment: ...
+    def convert_response(self, response: str, trace: Trace) -> Experiment: ...

-    def convert(self, hypothesis: Hypothesis, trace: Trace) -> FactorExperiment:
+    def convert(self, hypothesis: Hypothesis, trace: Trace) -> Experiment:
         context, json_flag = self.prepare_context(hypothesis, trace)
         system_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["hypothesis2experiment"]["system_prompt"])
             .render(
-                targets="factors",
-                scenario=trace.scen.get_scenario_all_desc(),
+                targets=self.targets,
+                scenario=trace.scen.get_scenario_all_desc(filtered_tag="hypothesis_and_experiment"),
                 experiment_output_format=context["experiment_output_format"],
             )
         )
         user_prompt = (
             Environment(undefined=StrictUndefined)
             .from_string(prompt_dict["hypothesis2experiment"]["user_prompt"])
             .render(
-                targets="factors",
+                targets=self.targets,
                 target_hypothesis=context["target_hypothesis"],
                 hypothesis_and_feedback=context["hypothesis_and_feedback"],
                 target_list=context["target_list"],
@@ -95,3 +111,21 @@ def convert(self, hypothesis: Hypothesis, trace: Trace) -> FactorExperiment:
         resp = APIBackend().build_messages_and_create_chat_completion(user_prompt, system_prompt, json_mode=json_flag)

         return self.convert_response(resp, trace)
+
+
+class FactorHypothesis2Experiment(LLMHypothesis2Experiment):
+    def __init__(self):
+        super().__init__()
+        self.targets = "factors"
+
+
+class ModelHypothesis2Experiment(LLMHypothesis2Experiment):
+    def __init__(self):
+        super().__init__()
+        self.targets = "model tuning"
+
+
+class FactorAndModelHypothesis2Experiment(LLMHypothesis2Experiment):
+    def __init__(self):
+        super().__init__()
+        self.targets = "feature engineering and model building"