add kaggle tpl (#482)

microsoft · Nov 11, 2024 · 3ddba41 · 3ddba41
1 parent f3405ca
commit 3ddba41
Show file tree

Hide file tree

Showing 10 changed files with 259 additions and 3 deletions.
diff --git a/...kaggle/experiment/tabular-playground-series-may-2022_template/model/model_randomforest.py b/...kaggle/experiment/tabular-playground-series-may-2022_template/model/model_randomforest.py
@@ -7,7 +7,7 @@
 
 import pandas as pd
 from sklearn.ensemble import RandomForestClassifier
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import roc_auc_score
 
 
 def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
@@ -22,8 +22,8 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali
 
     # Validate the model
     y_valid_pred = model.predict(X_valid)
-    accuracy = accuracy_score(y_valid, y_valid_pred)
-    print(f"Validation Accuracy: {accuracy:.4f}")
+    auroc = roc_auc_score(y_valid, y_valid_pred)
+    print(f"Validation AUROC: {auroc:.4f}")
 
     return model
 

diff --git a/...rios/kaggle/experiment/tabular-playground-series-may-2022_template/model/model_xgboost.py b/...rios/kaggle/experiment/tabular-playground-series-may-2022_template/model/model_xgboost.py
@@ -18,6 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
         "device": "cuda",
         "tree_method": "hist",
         "objective": "binary:logistic",
+        "eval_metric": "auc",
     }
     num_boost_round = 10
 

diff --git a/...enarios/kaggle/experiment/ventilator-pressure-prediction_template/fea_share_preprocess.py b/...enarios/kaggle/experiment/ventilator-pressure-prediction_template/fea_share_preprocess.py
@@ -0,0 +1,34 @@
+import os
+
+import pandas as pd
+from sklearn.model_selection import train_test_split
+from sklearn.preprocessing import MinMaxScaler
+
+
+def preprocess_script():
+    """
+    Load the competition data and return the train / validation / test splits.
+
+    When cached pickles are present under /kaggle/input they are returned as-is, followed by
+    the unpacked contents of others.pkl. Otherwise train.csv is split 70/30 (random_state=0)
+    into train and validation parts, and test.csv provides the test features and the ids.
+    """
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        cached_splits = (
+            pd.read_pickle("/kaggle/input/X_train.pkl"),
+            pd.read_pickle("/kaggle/input/X_valid.pkl"),
+            pd.read_pickle("/kaggle/input/y_train.pkl"),
+            pd.read_pickle("/kaggle/input/y_valid.pkl"),
+            pd.read_pickle("/kaggle/input/X_test.pkl"),
+        )
+        others = pd.read_pickle("/kaggle/input/others.pkl")
+
+        return (*cached_splits, *others)
+
+    train_df = pd.read_csv("/kaggle/input/train.csv")
+    test_df = pd.read_csv("/kaggle/input/test.csv")
+
+    # Target column first, then every remaining column except the row id as features
+    target = train_df["pressure"]
+    features = train_df.drop(["pressure", "id"], axis=1)
+
+    X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.3, random_state=0)
+
+    # The test set keeps its ids separately so they can be attached to the submission
+    return X_train, X_valid, y_train, y_valid, test_df.drop(["id"], axis=1), test_df["id"]
diff --git a/...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/feature/feature.py b/...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/feature/feature.py
@@ -0,0 +1,23 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task: a class that provides a `fit` and a `transform` method.
+Remember to expose that class through the module-level name `feature_engineering_cls`.
+"""
+
+
+class IdentityFeature:
+    """Pass-through feature step: learns nothing and returns its input untouched."""
+
+    def fit(self, train_df: pd.DataFrame):
+        """
+        Nothing is learned for the identity feature; the training data is ignored.
+        """
+        return None
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Hand back the very same object that was passed in (no copy is made).
+        """
+        return X
+
+
+# Name looked up by the training script to instantiate this module's feature class
+feature_engineering_cls = IdentityFeature
diff --git a/...ios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_randomforest.py b/...ios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_randomforest.py
@@ -0,0 +1,40 @@
+"""
+Motivation of the model:
+The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
+It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
+baseline model for many regression tasks.
+"""
+
+import pandas as pd
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.metrics import mean_absolute_error
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Train a RandomForestRegressor on the training split and print its validation MAE.
+
+    The fitted regressor is returned; the MAE itself is only printed, not returned.
+    """
+    model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)
+    model.fit(X_train, y_train)
+
+    # Score the hold-out split with mean absolute error
+    mae = mean_absolute_error(y_valid, model.predict(X_valid))
+    print(f"Validation MAE of RandomForestRegressor: {mae}")
+
+    return model
+
+
+def predict(model, X):
+    """
+    Run the fitted model on X and return its predictions as a single-column 2-D array.
+    """
+    return model.predict(X).reshape(-1, 1)
diff --git a/...cenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_xgboost.py b/...cenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_xgboost.py
@@ -0,0 +1,40 @@
+"""
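+Motivation of the model:
+XGBoost gradient-boosted trees (`gbtree` booster, histogram tree method on a CUDA device) serve as a
+regression baseline whose validation MAE is reported during training.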
+"""
+
+import pandas as pd
+import xgboost as xgb
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> xgb.Booster:
+    """
+    Train an XGBoost booster on the training split, reporting MAE on the validation split.
+
+    Returns the booster after all 1000 boosting rounds (no early stopping is configured).
+    """
+    # Wrap both splits in DMatrix containers, the input format xgb.train works with
+    train_matrix = xgb.DMatrix(X_train, label=y_train)
+    valid_matrix = xgb.DMatrix(X_valid, label=y_valid)
+
+    # Shallow trees (max_depth=2) with both reg_lambda and reg_alpha penalties, built with the
+    # histogram method on a CUDA device
+    booster_params = {
+        "learning_rate": 0.1,
+        "subsample": 0.95,
+        "colsample_bytree": 0.11,
+        "max_depth": 2,
+        "booster": "gbtree",
+        "reg_lambda": 66.1,
+        "reg_alpha": 15.9,
+        "random_state": 42,
+        "tree_method": "hist",
+        "device": "cuda",
+        "eval_metric": "mae",
+    }
+
+    # The validation metric is printed every 100 rounds
+    return xgb.train(
+        booster_params,
+        train_matrix,
+        num_boost_round=1000,
+        evals=[(valid_matrix, "validation")],
+        verbose_eval=100,
+    )
+
+
+def predict(model: xgb.Booster, X):
+    """
+    Convert X to a DMatrix and return the booster's predictions for it.
+    """
+    return model.predict(xgb.DMatrix(X))
diff --git a/...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_nn.py b/...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_nn.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Feature-selection hook shared by fit and predict.
+
+    Every column is kept. Multi-level column labels are flattened in place by joining the
+    levels with "_"; a frame with single-level columns is returned untouched.
+    """
+    # No real selection yet: only multi-level headers need work
+    if X.columns.nlevels != 1:
+        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
+    return X
diff --git a/...os/kaggle/experiment/ventilator-pressure-prediction_template/model/select_randomforest.py b/...os/kaggle/experiment/ventilator-pressure-prediction_template/model/select_randomforest.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Feature-selection hook shared by fit and predict.
+
+    Every column is kept. Multi-level column labels are flattened in place by joining the
+    levels with "_"; a frame with single-level columns is returned untouched.
+    """
+    # No real selection yet: only multi-level headers need work
+    if X.columns.nlevels != 1:
+        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
+    return X
diff --git a/...enarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_xgboost.py b/...enarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_xgboost.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Feature-selection hook shared by fit and predict.
+
+    Every column is kept. Multi-level column labels are flattened in place by joining the
+    levels with "_"; a frame with single-level columns is returned untouched.
+    """
+    # No real selection yet: only multi-level headers need work
+    if X.columns.nlevels != 1:
+        X.columns = ["_".join(map(str, col)).strip() for col in X.columns.values]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/train.py
@@ -0,0 +1,82 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import mean_absolute_error
+
+# Set random seed for reproducibility
+# (seeds both Python's `random` module and NumPy's global generator)
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+# Directory that contains this script; the feature/ and model/ globs below are resolved against it
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+def import_module_from_path(module_name, module_path):
+    """
+    Load the Python file at module_path as a module called module_name and return it.
+
+    The file is executed once; the resulting module is not inserted into sys.modules.
+    """
+    module_spec = importlib.util.spec_from_file_location(module_name, module_path)
+    loaded = importlib.util.module_from_spec(module_spec)
+    module_spec.loader.exec_module(loaded)
+    return loaded
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+
+# 2) Auto feature engineering
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+# Path.glob yields files in filesystem order; sort them so the feature_{i} prefixes assigned
+# below are identical on every run.
+for f in sorted(DIRNAME.glob("feature/feat*.py")):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    # Keep a feature only when it yields the same number of columns on all three splits
+    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+        print(f"Feature [{f.stem}] has been added to the feature list")
+
+# pd.concat would fail with "No objects to concatenate"; say what is actually wrong instead
+if not X_train_l:
+    raise ValueError(f"No usable feature module matched feature/feat*.py under {DIRNAME}")
+
+# Concatenate column-wise; the keys add a feature_{i} top level to the column labels
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+
+# 3) Train the models
+model_l = []  # list[tuple[model, predict_func, select_module]]
+# Sort the glob result so models are trained, logged and tie-broken in a stable order
+for f in sorted(DIRNAME.glob("model/model*.py")):
+    # model_xxx.py is paired with select_xxx.py in the same directory
+    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
+    select_m = import_module_from_path(select_python_path.stem, select_python_path)
+    # select() may rewrite the column labels in place, so it is given copies
+    X_train_selected = select_m.select(X_train.copy())
+    X_valid_selected = select_m.select(X_valid.copy())
+
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
+    print(f"Model [{f.stem}] has been trained")
+
+# 4) Evaluate the model on the validation set
+metrics_all = []
+for model, predict_func, select_m in model_l:
+    X_valid_selected = select_m.select(X_valid.copy())
+    y_valid_pred = predict_func(model, X_valid_selected)
+    mae = mean_absolute_error(y_valid, y_valid_pred)
+    print(f"[{type(model).__name__}] MAE on valid set: {mae}")
+    metrics_all.append(mae)
+
+# 5) Save the validation MAE of the best model (lower is better, hence argmin)
+best_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[best_index]], index=["MAE"]).to_csv("submission_score.csv")
+
+# 6) Make predictions on the test set with the best model
+best_model, best_predict, best_select = model_l[best_index]
+X_test_selected = best_select.select(X_test.copy())
+# Pressure is a continuous regression target, so predictions are submitted as-is (no label
+# offset), matching what was scored on the validation set.
+y_test_pred = best_predict(best_model, X_test_selected).flatten()
+
+
+# 7) Submit predictions for the test set
+submission_result = pd.DataFrame(y_test_pred, columns=["pressure"])
+submission_result.insert(0, "id", ids)
+
+submission_result.to_csv("submission.csv", index=False)