microsoft · xisen-w · Sep 26, 2024 · Sep 26, 2024 · Sep 27, 2024 · Sep 27, 2024
diff --git a/README.md b/README.md
@@ -18,11 +18,14 @@
 [![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
 [![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
 [![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
+[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)
 [![Readthedocs Preview](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml/badge.svg)](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml) <!-- this badge is too long, please place it in the last one to make it pretty --> 
 
 # 📰 News
 | 🗞️ News        | 📝 Description                 |
 | --            | ------                        |
+| Official WeChat group release  | We have created a WeChat group, and you are welcome to join! (🗪[QR Code](docs/WeChat_QR_code.jpg)) |
+| Official Discord release  | We have launched our first chat channel on Discord (🗪[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)) |
 | First release | **RDAgent** is released on GitHub |
 
 

diff --git a/docs/WeChat_QR_code.jpg b/docs/WeChat_QR_code.jpg
diff --git a/rdagent/log/ui/app.py b/rdagent/log/ui/app.py
@@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
                 st.latex(ft.factor_formulation)
 
                 mks = "| Variable | Description |\n| --- | --- |\n"
-                for v, d in ft.variables.items():
-                    mks += f"| ${v}$ | {d} |\n"
-                st.markdown(mks)
+                if isinstance(ft.variables, dict):
+                    for v, d in ft.variables.items():
+                        mks += f"| ${v}$ | {d} |\n"
+                    st.markdown(mks)
 
     elif isinstance(tasks[0], ModelTask):
         st.markdown("**Model Tasks🚩**")

diff --git a/rdagent/scenarios/kaggle/developer/feedback.py b/rdagent/scenarios/kaggle/developer/feedback.py
@@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
             self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
             self.scen.vector_base.save()
         elif self.scen.if_using_graph_rag:
-            self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
+            trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
 
         return HypothesisFeedback(
             observations=observations,

diff --git a/rdagent/scenarios/kaggle/developer/runner.py b/rdagent/scenarios/kaggle/developer/runner.py
@@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
         codes = "\n".join(codes)
         return md5_hash(codes)
 
+    def extract_model_task_from_code(self, code: str) -> str:
+        """
+        Ask the LLM to describe the model implemented in `code` and format the reply as text.
+
+        The system and user prompts come from prompt_dict["extract_model_task_from_code"];
+        `code` is rendered into the user prompt as `file_content`, and the reply is requested
+        in JSON mode.
+
+        Returns a "key: value" listing (name, description, optional formulation, architecture,
+        optional variables, hyperparameters, model_type). When the reply is not valid JSON, is
+        not a JSON object, or lacks a required field, a short failure message is returned
+        instead of raising, so a single malformed reply does not abort the caller.
+        """
+        prompts = prompt_dict["extract_model_task_from_code"]
+        env = Environment(undefined=StrictUndefined)
+        sys_prompt = env.from_string(prompts["system"]).render()
+        user_prompt = env.from_string(prompts["user"]).render(file_content=code)
+
+        model_task_description = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=True,
+        )
+
+        try:
+            response_json_analysis = json.loads(model_task_description)
+        except json.JSONDecodeError:
+            return "Failed to parse LLM's response as JSON"
+        if not isinstance(response_json_analysis, dict):
+            # Valid JSON that is not an object (a list, a bare string, ...) has no fields to read.
+            return "Failed to parse LLM's response as JSON"
+
+        try:
+            # The padding in front of "description" is kept as-is so the emitted text stays unchanged.
+            task_desc = (
+                f"name: {response_json_analysis['name']}\n"
+                f"        description: {response_json_analysis['description']}\n"
+                "        "
+            )
+            # "formulation" and "variables" are optional: they are skipped when absent or empty.
+            if response_json_analysis.get("formulation"):
+                task_desc += f"formulation: {response_json_analysis['formulation']}\n"
+            task_desc += f"architecture: {response_json_analysis['architecture']}\n"
+            if response_json_analysis.get("variables"):
+                task_desc += f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
+            task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
+            task_desc += f"model_type: {response_json_analysis['model_type']}\n"
+        except KeyError as missing:
+            # A required field was omitted by the LLM; report it instead of crashing the runner.
+            return f"Failed to extract model task: missing field {missing} in LLM's response"
+
+        return task_desc
+
+
     def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
         """
         For the initial development, the experiment serves as a benchmark for feature engineering.
@@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
         feature_shape = org_data.shape[-1]
         exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))
 
-        sub_model_1_description = (
-            self.extract_model_task_from_code(
-                (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
-            )
-            + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
-        )
-        sub_model_2_description = (
-            self.extract_model_task_from_code(
-                (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
-            )
-            + f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
-        )
+        model_map = {
+            "XGBoost": "model_xgboost.py",
+            "RandomForest": "model_randomforest.py",
+            "LightGBM": "model_lightgbm.py",
+            "NN": "model_nn.py",
+        }
+
+        workspace_path = exp.experiment_workspace.workspace_path / "model"
+
+        for model_name, model_file in model_map.items():
+            model_file_path = workspace_path / model_file
 
-        exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
-        exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
+            if model_file_path.exists():
+                model_description = (
+                    self.extract_model_task_from_code(model_file_path.read_text())
+                    + f"""code: {model_file_path.read_text()}"""
+                )
+            else:
+                model_description = ""
+
+            exp.experiment_workspace.model_description[model_name] = model_description
 
         if RUNNER_SETTINGS.cache_result:
             self.dump_cache_result(exp, result)
@@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:
 
 
 class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
-    def extract_model_task_from_code(self, code: str) -> str:
-        sys_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["system"])
-            .render()
-        )
-
-        user_prompt = (
-            Environment(undefined=StrictUndefined)
-            .from_string(prompt_dict["extract_model_task_from_code"]["user"])
-            .render(file_content=code)
-        )
-
-        model_task_description = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt=user_prompt,
-            system_prompt=sys_prompt,
-            json_mode=True,
-        )
-
-        try:
-            response_json_analysis = json.loads(model_task_description)
-            task_desc = f"""name: {response_json_analysis['name']}
-        description: {response_json_analysis['description']}
-        """
-            task_desc += (
-                f"formulation: {response_json_analysis['formulation']}\n"
-                if response_json_analysis.get("formulation")
-                else ""
-            )
-            task_desc += f"architecture: {response_json_analysis['architecture']}\n"
-            task_desc += (
-                f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
-                if response_json_analysis.get("variables")
-                else ""
-            )
-            task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
-            task_desc += f"model_type: {response_json_analysis['model_type']}\n"
-        except json.JSONDecodeError:
-            task_desc = "Failed to parse LLM's response as JSON"
-
-        return task_desc
-
     def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
-        if exp.based_experiments and exp.based_experiments[-1].result is None:
-            exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
         current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
         implemented_factor_count = 0
         for sub_ws in exp.sub_workspace_list:
@@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
         if implemented_factor_count == 0:
             raise FactorEmptyError("No factor is implemented")
 
+        # initial template result
+        if exp.based_experiments and exp.based_experiments[-1].result is None:
+            exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
+
         if RUNNER_SETTINGS.cache_result:
             cache_hit, result = self.get_cache_result(exp)
             if cache_hit:

diff --git a/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py b/...narios/kaggle/experiment/facebook-v-predicting-check-ins_template/fea_share_preprocess.py
@@ -0,0 +1,86 @@
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import LabelEncoder
+
+
+def preprocess_data(df):
+    """
+    Add time, distance, accuracy-bin and interaction columns to `df` and return it.
+
+    The frame is modified in place; the returned object is the same `df`.
+    Reads the columns "time", "x", "y" and "accuracy".
+    """
+    time_col, x_col, y_col, acc_col = df["time"], df["x"], df["y"], df["accuracy"]
+    day_len = 24
+    week_len = 7
+
+    # Break the raw time value into a position within a 24-unit cycle,
+    # a position within a 7-cycle week, and a running week counter.
+    df["hour"] = time_col % day_len
+    df["day"] = (time_col // day_len) % week_len
+    df["week"] = time_col // (day_len * week_len)
+
+    # Euclidean distance of (x, y) from the origin.
+    df["dist_from_center"] = np.sqrt(x_col**2 + y_col**2)
+
+    # Five equal-width accuracy bins, stored as integer bin indices.
+    df["accuracy_bins"] = pd.cut(acc_col, bins=5, labels=False)
+
+    # Pairwise products of the coordinates and the accuracy.
+    df["xy"] = x_col * y_col
+    df["x_accuracy"] = x_col * acc_col
+    df["y_accuracy"] = y_col * acc_col
+
+    return df
+
+
+def preprocess_script():
+    """
+    Build the train / validation / test inputs for this competition template.
+
+    If "/kaggle/input/X_train.pkl" exists, the six pickled artefacts under /kaggle/input are
+    loaded and returned as they are. Otherwise the first 1000 rows of train.csv and test.csv
+    are feature-engineered, "place_id" is label-encoded, the training rows are split 80/20
+    and missing values are mean-imputed with statistics fitted on the training split.
+
+    Returns (X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes)
+    on the fresh path; on the cached path the trailing items are whatever "others.pkl" unpacks to.
+    """
+    # Fast path: reuse previously pickled splits when they are present.
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        cached_X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
+        cached_X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
+        cached_y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
+        cached_y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
+        cached_X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
+        extras = pd.read_pickle("/kaggle/input/others.pkl")
+        return (cached_X_train, cached_X_valid, cached_y_train, cached_y_valid, cached_X_test, *extras)
+
+    # Only the first 1000 rows of each CSV are used.
+    train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
+    test_df = pd.read_csv("/kaggle/input/test.csv").head(1000)
+    train_df = preprocess_data(train_df)
+    test_df = preprocess_data(test_df)
+
+    # Map the place ids seen in training to consecutive integer codes.
+    place_id_encoder = LabelEncoder().fit(train_df["place_id"])
+    train_df["place_id"] = place_id_encoder.transform(train_df["place_id"])
+
+    # Target and feature matrix for training.
+    y = train_df["place_id"]
+    X = train_df.drop(columns=["place_id"])
+
+    # Test features: keep the row ids aside, then align the columns with X.
+    test_row_ids = test_df["row_id"]
+    X_test = test_df.drop(columns=["row_id"])
+    absent_columns = [col for col in X.columns if col not in X_test.columns]
+    for col in absent_columns:
+        X_test[col] = 0  # placeholder value for columns that only exist in the training frame
+    X_test = X_test[X.columns]
+
+    # Prefer a stratified split; train_test_split raises ValueError when that is not possible.
+    split_options = {"test_size": 0.2, "random_state": 42}
+    try:
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, **split_options)
+    except ValueError:
+        print("Warning: Stratified split not possible. Falling back to random split.")
+        X_train, X_valid, y_train, y_valid = train_test_split(X, y, **split_options)
+
+    # Mean imputation: statistics come from the training split and are reused for the others.
+    # NOTE(review): the rebuilt frames get a fresh default index, whereas y_train / y_valid keep
+    # the row labels produced by the split -- confirm nothing downstream aligns them by index.
+    imputer = SimpleImputer(strategy="mean")
+    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
+    X_valid, X_test = (
+        pd.DataFrame(imputer.transform(frame), columns=frame.columns) for frame in (X_valid, X_test)
+    )
+
+    n_classes = len(place_id_encoder.classes_)
+
+    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
diff --git a/...t/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/feature/feature.py b/...t/scenarios/kaggle/experiment/facebook-v-predicting-check-ins_template/feature/feature.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
+Remember
+"""
+
+
+class IdentityFeature:
+    """No-op feature transformer: `fit` learns nothing and `transform` hands back its input."""
+
+    def fit(self, train_df: pd.DataFrame):
+        """Accept the training frame without learning anything from it."""
+
+    def transform(self, X: pd.DataFrame):
+        """Return `X` itself, unchanged."""
+        return X
+
+
+# Name under which this template exposes its feature-engineering class.
+feature_engineering_cls = IdentityFeature
diff --git a/...os/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py b/...os/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_randomforest.py
@@ -0,0 +1,33 @@
+"""
+Motivation of the model:
+The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
+It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
+baseline model for many classification tasks.
+"""
+
+import pandas as pd
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import accuracy_score
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Train a 200-tree random forest classifier on the training split and return it.
+
+    `X_valid` and `y_valid` are accepted but not used by this model.
+    """
+    forest = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
+    # scikit-learn estimators return themselves from fit(), so the fitted model is returned directly.
+    return forest.fit(X_train, y_train)
+
+
+def predict(model, X):
+    """
+    Predict with the trained model and return the result as a single-column 2-D array.
+    """
+    predictions = model.predict(X)
+    n_columns = 1
+    return predictions.reshape(-1, n_columns)
diff --git a/...enarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py b/...enarios/kaggle/experiment/facebook-v-predicting-check-ins_template/model/model_xgboost.py
@@ -0,0 +1,51 @@
+import numpy as np
+import pandas as pd
+import xgboost as xgb
+from sklearn.preprocessing import LabelEncoder
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Train a multi-class XGBoost booster and return it.
+
+    Labels from both splits are re-encoded with a LabelEncoder fitted on their union, and the
+    encoder is attached to the returned booster as `le`. Training is capped at 100 boosting
+    rounds with `early_stopping_rounds=10`, monitored on the evaluation sets passed below.
+    """
+    # Fit the encoder on the union of labels so both splits can be transformed.
+    all_labels = np.unique(np.concatenate([y_train, y_valid]))
+    le = LabelEncoder().fit(all_labels)
+
+    dtrain = xgb.DMatrix(X_train, label=le.transform(y_train))
+    dvalid = xgb.DMatrix(X_valid, label=le.transform(y_valid))
+
+    params = {
+        "objective": "multi:softprob",
+        "num_class": len(le.classes_),
+        "max_depth": 6,
+        "eta": 0.3,
+        "subsample": 0.8,
+        "colsample_bytree": 0.8,
+        "min_child_weight": 1,
+        "nthread": -1,
+    }
+
+    # `evals` is passed by keyword: newer xgboost releases deprecate passing it positionally.
+    bst = xgb.train(
+        params,
+        dtrain,
+        num_boost_round=100,
+        evals=[(dtrain, "train"), (dvalid, "eval")],
+        early_stopping_rounds=10,
+    )
+
+    # Keep the encoder on the booster so predictions can be mapped back to the original labels.
+    bst.le = le
+
+    return bst
+
+
+def predict(model, X):
+    """
+    Wrap `X` in a DMatrix and return what `model.predict` yields for it.
+
+    The output is returned as-is; it is not mapped back to the original labels.
+    """
+    # To recover labels instead: model.le.inverse_transform(y_pred_prob.argmax(axis=1))
+    return model.predict(xgb.DMatrix(X))