Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: supporting Facebook competition (don't merge now) #364

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,14 @@
[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit)
[![Checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](http://mypy-lang.org/)
[![Ruff](https://img.shields.io/endpoint?url=https://raw.githubusercontent.com/astral-sh/ruff/main/assets/badge/v2.json)](https://github.com/astral-sh/ruff)
[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)
[![Readthedocs Preview](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml/badge.svg)](https://github.com/microsoft/RD-Agent/actions/workflows/readthedocs-preview.yml) <!-- this badge is too long, please place it in the last one to make it pretty -->

# 📰 News
| 🗞️ News | 📝 Description |
| -- | ------ |
| Official WeChat group release | We have created a WeChat group — you are welcome to join! (🗪[QR Code](docs/WeChat_QR_code.jpg)) |
| Official Discord release | We launched our first chat channel on Discord (🗪[![Chat](https://img.shields.io/badge/chat-discord-blue)](https://discord.gg/ybQ97B6Jjy)) |
| First release | **RDAgent** is released on GitHub |


Expand Down
Binary file added docs/WeChat_QR_code.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
7 changes: 4 additions & 3 deletions rdagent/log/ui/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -443,9 +443,10 @@ def tasks_window(tasks: list[FactorTask | ModelTask]):
st.latex(ft.factor_formulation)

mks = "| Variable | Description |\n| --- | --- |\n"
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)
if isinstance(ft.variables, dict):
for v, d in ft.variables.items():
mks += f"| ${v}$ | {d} |\n"
st.markdown(mks)

elif isinstance(tasks[0], ModelTask):
st.markdown("**Model Tasks🚩**")
Expand Down
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/developer/feedback.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def generate_feedback(self, exp: Experiment, hypothesis: Hypothesis, trace: Trac
self.scen.vector_base.add_experience_to_vector_base(experiment_feedback)
self.scen.vector_base.save()
elif self.scen.if_using_graph_rag:
self.scen.trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)
trace.knowledge_base.load_from_documents([experiment_feedback], self.scen)

return HypothesisFeedback(
observations=observations,
Expand Down
124 changes: 66 additions & 58 deletions rdagent/scenarios/kaggle/developer/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,48 @@ def get_cache_key(self, exp: ASpecificExp) -> str:
codes = "\n".join(codes)
return md5_hash(codes)

def extract_model_task_from_code(self, code: str) -> str:
    """Ask the LLM to summarise a model source file as a plain-text task description.

    The system and user prompts are taken from the
    ``extract_model_task_from_code`` entry of ``prompt_dict``; the user prompt
    is rendered with ``code`` passed as ``file_content``. The chat completion
    is requested in JSON mode and the reply is flattened into ``key: value``
    lines.

    Args:
        code: Source text of the model file to describe.

    Returns:
        A multi-line description with ``name``, ``description``,
        ``architecture``, ``hyperparameters`` and ``model_type``, plus
        ``formulation`` and ``variables`` when the reply carries non-empty
        values for them. If the reply is not valid JSON, or is valid JSON
        without the required fields, a one-line failure message is returned
        instead of raising.
    """
    prompts = prompt_dict["extract_model_task_from_code"]
    sys_prompt = Environment(undefined=StrictUndefined).from_string(prompts["system"]).render()
    user_prompt = (
        Environment(undefined=StrictUndefined).from_string(prompts["user"]).render(file_content=code)
    )

    model_task_description = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    # Keep the try bodies minimal so each failure mode gets its own message.
    try:
        response_json_analysis = json.loads(model_task_description)
    except json.JSONDecodeError:
        return "Failed to parse LLM's response as JSON"

    try:
        # Explicit "\n" separators keep the output independent of how this
        # source file happens to be indented.
        task_desc = f"name: {response_json_analysis['name']}\n"
        task_desc += f"description: {response_json_analysis['description']}\n"
        if response_json_analysis.get("formulation"):
            task_desc += f"formulation: {response_json_analysis['formulation']}\n"
        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
        if response_json_analysis.get("variables"):
            task_desc += f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
    except (KeyError, TypeError, AttributeError) as err:
        # Valid JSON that is not an object, or an object lacking a required
        # field, is handled like unparsable JSON: report it, do not crash.
        return f"Failed to extract model task fields from LLM's response: {err!r}"

    return task_desc

def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorExperiment | KGModelExperiment:
"""
For the initial development, the experiment serves as a benchmark for feature engineering.
Expand Down Expand Up @@ -59,21 +101,27 @@ def init_develop(self, exp: KGFactorExperiment | KGModelExperiment) -> KGFactorE
feature_shape = org_data.shape[-1]
exp.experiment_workspace.data_description.append((sub_task.get_task_information(), feature_shape))

sub_model_1_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_randomforest.py").read_text()}"""
)
sub_model_2_description = (
self.extract_model_task_from_code(
(exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()
)
+ f"""code: { (exp.experiment_workspace.workspace_path / "model" / "model_xgboost.py").read_text()}"""
)
model_map = {
"XGBoost": "model_xgboost.py",
"RandomForest": "model_randomforest.py",
"LightGBM": "model_lightgbm.py",
"NN": "model_nn.py",
}

workspace_path = exp.experiment_workspace.workspace_path / "model"

for model_name, model_file in model_map.items():
model_file_path = workspace_path / model_file

exp.experiment_workspace.model_description["XGBoost"] = sub_model_1_description
exp.experiment_workspace.model_description["RandomForest"] = sub_model_2_description
if model_file_path.exists():
model_description = (
self.extract_model_task_from_code(model_file_path.read_text())
+ f"""code: {model_file_path.read_text()}"""
)
else:
model_description = ""

exp.experiment_workspace.model_description[model_name] = model_description

if RUNNER_SETTINGS.cache_result:
self.dump_cache_result(exp, result)
Expand Down Expand Up @@ -120,51 +168,7 @@ def develop(self, exp: KGModelExperiment) -> KGModelExperiment:


class KGFactorRunner(KGCachedRunner[KGFactorExperiment]):
def extract_model_task_from_code(self, code: str) -> str:
    """Ask the LLM to summarise a model source file as a plain-text task description.

    The system and user prompts are taken from the
    ``extract_model_task_from_code`` entry of ``prompt_dict``; the user prompt
    is rendered with ``code`` passed as ``file_content``. The chat completion
    is requested in JSON mode and the reply is flattened into ``key: value``
    lines.

    Args:
        code: Source text of the model file to describe.

    Returns:
        A multi-line description with ``name``, ``description``,
        ``architecture``, ``hyperparameters`` and ``model_type``, plus
        ``formulation`` and ``variables`` when the reply carries non-empty
        values for them. If the reply is not valid JSON, or is valid JSON
        without the required fields, a one-line failure message is returned
        instead of raising.
    """
    prompts = prompt_dict["extract_model_task_from_code"]
    sys_prompt = Environment(undefined=StrictUndefined).from_string(prompts["system"]).render()
    user_prompt = (
        Environment(undefined=StrictUndefined).from_string(prompts["user"]).render(file_content=code)
    )

    model_task_description = APIBackend().build_messages_and_create_chat_completion(
        user_prompt=user_prompt,
        system_prompt=sys_prompt,
        json_mode=True,
    )

    # Keep the try bodies minimal so each failure mode gets its own message.
    try:
        response_json_analysis = json.loads(model_task_description)
    except json.JSONDecodeError:
        return "Failed to parse LLM's response as JSON"

    try:
        # Explicit "\n" separators keep the output independent of how this
        # source file happens to be indented.
        task_desc = f"name: {response_json_analysis['name']}\n"
        task_desc += f"description: {response_json_analysis['description']}\n"
        if response_json_analysis.get("formulation"):
            task_desc += f"formulation: {response_json_analysis['formulation']}\n"
        task_desc += f"architecture: {response_json_analysis['architecture']}\n"
        if response_json_analysis.get("variables"):
            task_desc += f"variables: {json.dumps(response_json_analysis['variables'], indent=4)}\n"
        task_desc += f"hyperparameters: {json.dumps(response_json_analysis['hyperparameters'], indent=4)}\n"
        task_desc += f"model_type: {response_json_analysis['model_type']}\n"
    except (KeyError, TypeError, AttributeError) as err:
        # Valid JSON that is not an object, or an object lacking a required
        # field, is handled like unparsable JSON: report it, do not crash.
        return f"Failed to extract model task fields from LLM's response: {err!r}"

    return task_desc

def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])
current_feature_file_count = len(list(exp.experiment_workspace.workspace_path.glob("feature/feature*.py")))
implemented_factor_count = 0
for sub_ws in exp.sub_workspace_list:
Expand All @@ -179,6 +183,10 @@ def develop(self, exp: KGFactorExperiment) -> KGFactorExperiment:
if implemented_factor_count == 0:
raise FactorEmptyError("No factor is implemented")

# initial template result
if exp.based_experiments and exp.based_experiments[-1].result is None:
exp.based_experiments[-1] = self.init_develop(exp.based_experiments[-1])

if RUNNER_SETTINGS.cache_result:
cache_hit, result = self.get_cache_result(exp)
if cache_hit:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def preprocess_data(df):
    """Add engineered columns to ``df`` in place and return the same frame.

    Columns appended, in this order:
        hour, day, week      -- ``time`` modulo 24, ``time // 24`` modulo 7,
                                and ``time // (24 * 7)``
        dist_from_center     -- Euclidean norm of ``(x, y)``
        accuracy_bins        -- integer code of ``accuracy`` cut into 5
                                equal-width bins over this frame's own range
        xy, x_accuracy,
        y_accuracy           -- pairwise products of ``x``, ``y``, ``accuracy``
    """
    hours_per_day, days_per_week = 24, 7

    # Calendar-style decompositions of the raw ``time`` column.
    timestamp = df["time"]
    df["hour"] = timestamp % hours_per_day
    df["day"] = (timestamp // hours_per_day) % days_per_week
    df["week"] = timestamp // (hours_per_day * days_per_week)

    x_coord, y_coord, accuracy = df["x"], df["y"], df["accuracy"]

    # Distance of each point from the origin.
    df["dist_from_center"] = np.sqrt(x_coord**2 + y_coord**2)

    # Bin edges come from the frame being processed, not from a shared reference.
    df["accuracy_bins"] = pd.cut(accuracy, bins=5, labels=False)

    # Pairwise interaction terms.
    interactions = (
        ("xy", x_coord, y_coord),
        ("x_accuracy", x_coord, accuracy),
        ("y_accuracy", y_coord, accuracy),
    )
    for column, left, right in interactions:
        df[column] = left * right

    return df


def preprocess_script():
    """Load the competition data, engineer features and build train/valid/test splits.

    If ``/kaggle/input/X_train.pkl`` exists, the previously pickled splits are
    loaded and returned unchanged. Otherwise the CSV files are read,
    ``preprocess_data`` is applied, ``place_id`` is label-encoded, the training
    rows are split 80/20 and missing values are mean-imputed.

    Returns:
        On the CSV path, the tuple ``(X_train, X_valid, y_train, y_valid,
        X_test, place_id_encoder, test_row_ids, n_classes)``. On the cached
        path, the first five items followed by the unpacked contents of
        ``others.pkl``.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        # NOTE(review): read_pickle executes arbitrary code on load; these
        # files are assumed to be produced by a trusted earlier run.
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")
        return X_train, X_valid, y_train, y_valid, X_test, *others

    # Only the first 1000 rows of each file are used.
    train_df = pd.read_csv("/kaggle/input/train.csv").head(1000)
    test_df = pd.read_csv("/kaggle/input/test.csv").head(1000)

    train_df = preprocess_data(train_df)
    test_df = preprocess_data(test_df)

    # Encode place_ids as consecutive integer class labels.
    place_id_encoder = LabelEncoder()
    place_id_encoder.fit(train_df["place_id"])
    train_df["place_id"] = place_id_encoder.transform(train_df["place_id"])

    # Split features and target. ``row_id`` is an identifier, not a feature:
    # it is removed from the test features below, so it is removed here too.
    # Otherwise models would train on a column that is a constant 0 at
    # prediction time.
    X = train_df.drop(columns=["place_id", "row_id"], errors="ignore")
    y = train_df["place_id"]

    # Prepare test data.
    test_row_ids = test_df["row_id"]
    X_test = test_df.drop(["row_id"], axis=1)

    # Give X_test exactly the columns of X, in the same order; columns absent
    # from the test data are filled with 0.
    X_test = X_test.reindex(columns=X.columns, fill_value=0)

    # Attempt stratified split, fall back to random split if necessary.
    try:
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    except ValueError:
        print("Warning: Stratified split not possible. Falling back to random split.")
        X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

    # Mean-impute missing values using statistics from the training split only.
    # The original index is passed through so every X frame stays aligned with
    # its y series (and X_test with test_row_ids) instead of being renumbered.
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

    # Number of distinct place_id classes seen in the training data.
    n_classes = len(place_id_encoder.classes_)

    return X_train, X_valid, y_train, y_valid, X_test, place_id_encoder, test_row_ids, n_classes
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
    """No-op feature transformer: ``fit`` learns nothing, ``transform`` echoes its input."""

    def fit(self, train_df: pd.DataFrame) -> None:
        """Accept the training data without deriving any state from it."""
        return None

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """Return ``X`` itself, unmodified (the same object, not a copy)."""
        return X


# Name under which this module exposes its feature-engineering class.
feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a 200-tree random forest on the training split and return it.

    ``X_valid`` and ``y_valid`` are part of the signature but are not used by
    this model.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
    # ``fit`` hands back the fitted estimator itself.
    return forest.fit(X_train, y_train)


def predict(model, X):
    """Return ``model``'s predictions for ``X`` as a single-column 2-D array."""
    return model.predict(X).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """Train a multi-class XGBoost booster and return it.

    Labels from both splits are re-encoded with a ``LabelEncoder``; the fitted
    encoder is attached to the returned booster as ``.le``. Training runs for
    up to 100 rounds with ``early_stopping_rounds=10``, watching both the
    training and validation matrices.
    """
    # The encoder has to know every label that occurs in either split.
    label_encoder = LabelEncoder().fit(np.unique(np.concatenate([y_train, y_valid])))
    encoded_train, encoded_valid = (label_encoder.transform(labels) for labels in (y_train, y_valid))

    train_matrix = xgb.DMatrix(X_train, label=encoded_train)
    valid_matrix = xgb.DMatrix(X_valid, label=encoded_valid)

    booster_params = dict(
        objective="multi:softprob",
        num_class=len(label_encoder.classes_),
        max_depth=6,
        eta=0.3,
        subsample=0.8,
        colsample_bytree=0.8,
        min_child_weight=1,
        nthread=-1,
    )

    booster = xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=100,
        evals=[(train_matrix, "train"), (valid_matrix, "eval")],
        early_stopping_rounds=10,
    )

    # Keep the encoder with the model so predictions can be mapped back to labels.
    booster.le = label_encoder
    return booster


def predict(model, X):
    """Return the booster's raw prediction output for ``X``.

    The output is passed through as-is; it is not converted back to the
    original labels via ``model.le``.
    """
    return model.predict(xgb.DMatrix(X))
Loading