feat: new-york-city-taxi-fare-prediction_template (#488)

* copy init version * feat: new-york-city-taxi-fare-prediction_template * add move to linear model * Add more details about docker * auto lint * auto lint with new black
microsoft · Nov 15, 2024 · a9caab7 · a9caab7
1 parent f6c522b
commit a9caab7
Show file tree

Hide file tree

Showing 15 changed files with 339 additions and 4 deletions.
diff --git a/rdagent/components/proposal/__init__.py b/rdagent/components/proposal/__init__.py
@@ -19,7 +19,6 @@
 
 
 class LLMHypothesisGen(HypothesisGen):
-
     def __init__(self, scen: Scenario):
         super().__init__(scen)
 

diff --git a/...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/fea_share_preprocess.py b/...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/fea_share_preprocess.py
@@ -0,0 +1,75 @@
+import os
+
+import numpy as np
+import pandas as pd
+from sklearn.impute import SimpleImputer
+from sklearn.model_selection import train_test_split
+
+# Identifier column: dropped from the feature frames; its test-set values are returned as `ids`.
+index_name = "key"
+# Column used as the prediction target `y`.
+label_name = "fare_amount"
+
+
+def prepreprocess():
+    """
+    Read the raw training CSV and produce a hold-out split.
+
+    The identifier column (`index_name`) is discarded, the `label_name` column
+    becomes the target, and the rows are divided 80/20 into a training and a
+    validation part using a fixed random seed.
+
+    Returns:
+        A tuple ``(X_train, X_valid, y_train, y_valid)``.
+    """
+    raw_df = pd.read_csv("/kaggle/input/train.csv")
+    raw_df = raw_df.drop(columns=[index_name])
+
+    target = raw_df[label_name]
+    features = raw_df.drop(columns=[label_name])
+
+    # A fixed random_state keeps the split identical from run to run.
+    X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.20, random_state=42)
+    return X_train, X_valid, y_train, y_valid
+
+
+def preprocess_script():
+    """
+    Return the train/validation/test data consumed by the training script.
+
+    When cached pickles exist under ``/kaggle/input`` (detected through
+    ``X_train.pkl``) they are loaded and returned, with the content of
+    ``others.pkl`` unpacked at the end of the tuple. Otherwise the data is
+    built from ``train.csv`` (via `prepreprocess`) and ``test.csv``.
+
+    Returns:
+        ``(X_train, X_valid, y_train, y_valid, X_test, ids)`` on the CSV path,
+        ``(X_train, X_valid, y_train, y_valid, X_test, *others)`` on the cached path.
+    """
+    if os.path.exists("/kaggle/input/X_train.pkl"):
+        # NOTE: unpickling executes arbitrary code; these files must come from a trusted source.
+        cached_paths = (
+            "/kaggle/input/X_train.pkl",
+            "/kaggle/input/X_valid.pkl",
+            "/kaggle/input/y_train.pkl",
+            "/kaggle/input/y_valid.pkl",
+            "/kaggle/input/X_test.pkl",
+        )
+        cached = [pd.read_pickle(path) for path in cached_paths]
+        others = pd.read_pickle("/kaggle/input/others.pkl")
+        return (*cached, *others)
+
+    X_train, X_valid, y_train, y_valid = prepreprocess()
+
+    # The test identifiers are split off and returned separately from the test features.
+    test_df = pd.read_csv("/kaggle/input/test.csv")
+    ids = test_df[index_name]
+    X_test = test_df.drop(columns=[index_name])
+
+    return X_train, X_valid, y_train, y_valid, X_test, ids
+
+
+def clean_and_impute_data(X_train, X_valid, X_test):
+    """
+    Replace infinities with NaN, mean-impute missing values and drop duplicate columns.
+
+    The imputer is fitted on the training data only and then applied to the
+    validation and test data. The frames passed in are left untouched, and the
+    returned frames keep the row index of their respective input.
+
+    Args:
+        X_train: Training features; used to fit the imputer.
+        X_valid: Validation features.
+        X_test: Test features.
+
+    Returns:
+        ``(X_train, X_valid, X_test)`` as new DataFrames.
+    """
+    # Non-inplace replace: the caller's frames must not be modified as a side effect.
+    X_train = X_train.replace([np.inf, -np.inf], np.nan)
+    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
+    X_test = X_test.replace([np.inf, -np.inf], np.nan)
+
+    # Impute missing values with the per-column mean of the training data.
+    imputer = SimpleImputer(strategy="mean")
+    # The index is passed on explicitly: rebuilding a DataFrame from the imputer output without
+    # it would reset the rows to 0..n-1 and break index alignment with the target series.
+    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
+    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
+    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)
+
+    # Remove duplicate columns, keeping the first occurrence of each label.
+    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
+    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
+    X_test = X_test.loc[:, ~X_test.columns.duplicated()]
+
+    return X_train, X_valid, X_test
diff --git a/...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/feature/feature.py b/...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/feature/feature.py
@@ -0,0 +1,30 @@
+import pandas as pd
+
+"""
+Here is the feature engineering code for each task, with a class that has a fit and transform method.
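+Remember: `transform` is applied to the training, validation and test data alike, so it must work on all of them.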
+"""
+
+
+class DatetimeFeature:
+    """Expand the ``pickup_datetime`` column into numeric calendar features."""
+
+    def fit(self, train_df: pd.DataFrame):
+        """
+        Fit the feature engineering model to the training data.
+
+        Nothing is learned from the data; the method exists only to satisfy the
+        fit/transform interface.
+        """
+
+    def transform(self, X: pd.DataFrame):
+        """
+        Return a new frame where ``pickup_datetime`` is replaced by calendar columns.
+
+        The input frame is not modified, so the same frame can be transformed
+        more than once (or handed to another feature class afterwards).
+
+        Args:
+            X: Frame with a ``pickup_datetime`` column formatted as
+                ``YYYY-MM-DD HH:MM:SS UTC``.
+
+        Returns:
+            A copy of `X` without ``pickup_datetime`` and with the additional
+            columns ``hour``, ``day``, ``month``, ``weekday`` and ``year``.
+        """
+        pickup = pd.to_datetime(X["pickup_datetime"], format="%Y-%m-%d %H:%M:%S UTC")
+
+        # `drop` returns a new frame, which keeps the caller's data intact.
+        out = X.drop(columns=["pickup_datetime"])
+        out["hour"] = pickup.dt.hour
+        out["day"] = pickup.dt.day
+        out["month"] = pickup.dt.month
+        out["weekday"] = pickup.dt.weekday
+        out["year"] = pickup.dt.year
+        return out
+
+
+feature_engineering_cls = DatetimeFeature
diff --git a/...arios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/model_linear.py b/...arios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/model_linear.py
@@ -0,0 +1,38 @@
+"""
+Motivation of the model:
+The Linear Regression model is chosen for its simplicity and interpretability. It is a good starting point for regression tasks
+and provides a baseline to compare more complex models against. Linear Regression assumes a linear relationship between the 
+features and the target variable, which can be a reasonable assumption for many problems.
+"""
+
+import pandas as pd
+from sklearn.linear_model import LinearRegression
+from sklearn.metrics import mean_squared_error
+
+
+def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
+    """
+    Train a Linear Regression model and report its error on the validation data.
+
+    Args:
+        X_train: Training features.
+        y_train: Training target.
+        X_valid: Validation features, used only for the printed score.
+        y_valid: Validation target, used only for the printed score.
+
+    Returns:
+        The fitted ``LinearRegression`` instance.
+    """
+    # `fit` returns the estimator itself, so construction and training can be chained.
+    regressor = LinearRegression().fit(X_train, y_train)
+
+    # The validation error is printed for the log only; it does not affect the model.
+    mse = mean_squared_error(y_valid, regressor.predict(X_valid))
+    print(f"Validation Mean Squared Error: {mse:.4f}")
+
+    return regressor
+
+
+def predict(model, X):
+    """
+    Run the trained model on `X` and return the predictions as a column vector.
+
+    Returns:
+        The output of ``model.predict(X)`` reshaped to ``(n_samples, 1)``.
+    """
+    return model.predict(X).reshape(-1, 1)
diff --git a/...os/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_lightgbm.py b/...os/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_lightgbm.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Keep every feature and flatten multi-level column labels.
+
+    No feature is filtered out yet. When the columns have more than one level,
+    each label tuple is joined with "_" into a single string; the columns of
+    `X` are replaced in place and the same object is returned.
+    """
+    # A single-level column index needs no flattening.
+    if X.columns.nlevels > 1:
+        X.columns = ["_".join(str(part) for part in col).strip() for col in X.columns.values]
+    return X
diff --git a/...rios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_linear.py b/...rios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_linear.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Return `X` with all of its features, using flat string column labels.
+
+    Feature selection is currently a pass-through. Multi-level column labels
+    are collapsed into "<level0>_<level1>..." strings directly on `X`.
+    """
+    columns = X.columns
+    if columns.nlevels != 1:
+        flat_names = []
+        for col in columns.values:
+            flat_names.append("_".join(map(str, col)).strip())
+        X.columns = flat_names
+    return X
diff --git a/...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_nn.py b/...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_nn.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Pass all features through, collapsing multi-level column labels.
+
+    Frames with single-level columns are returned untouched. Otherwise the
+    levels of each column label are joined with "_" and assigned back to `X`
+    before it is returned.
+    """
+    if X.columns.nlevels == 1:
+        return X
+
+    def _flatten(levels):
+        # One label tuple -> one underscore-separated string.
+        return "_".join(str(level) for level in levels).strip()
+
+    X.columns = list(map(_flatten, X.columns.values))
+    return X
diff --git a/...aggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_randomforest.py b/...aggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_randomforest.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Select the features used by the model (currently: all of them).
+
+    As a side effect on `X`, multi-level column labels are rewritten as flat
+    strings whose parts are separated by "_". The same frame is returned.
+    """
+    single_level = X.columns.nlevels == 1
+    if not single_level:
+        X.columns = ["_".join([str(piece) for piece in col]).strip() for col in X.columns.values]
+    return X
diff --git a/...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_xgboost.py b/...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_xgboost.py
@@ -0,0 +1,12 @@
+import pandas as pd
+
+
+def select(X: pd.DataFrame) -> pd.DataFrame:
+    """
+    Hand back every feature of `X`, with flat column labels.
+
+    There is no selection logic yet. If the columns are multi-level, every
+    label tuple is turned into one "_"-joined string, written back onto `X`.
+    """
+    if X.columns.nlevels == 1:
+        return X
+    joined = ("_".join(map(str, col)) for col in X.columns.values)
+    X.columns = [name.strip() for name in joined]
+    return X
diff --git a/rdagent/scenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/train.py b/rdagent/scenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/train.py
@@ -0,0 +1,91 @@
+import importlib.util
+import random
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+from fea_share_preprocess import clean_and_impute_data, preprocess_script
+from sklearn.metrics import matthews_corrcoef, root_mean_squared_error
+
+# Set random seed for reproducibility
+# (seeds both Python's `random` module and NumPy's global generator).
+SEED = 42
+random.seed(SEED)
+np.random.seed(SEED)
+# Directory containing this script; the feature/ and model/ modules are looked up relative to it.
+DIRNAME = Path(__file__).absolute().resolve().parent
+
+
+def compute_metrics_for_classification(y_true, y_pred):
+    """Return the Matthews correlation coefficient of `y_pred` against `y_true`."""
+    return matthews_corrcoef(y_true, y_pred)
+
+
+def import_module_from_path(module_name, module_path):
+    """
+    Load a Python source file as a module, independently of ``sys.path``.
+
+    Args:
+        module_name: Name given to the resulting module object.
+        module_path: Location of the source file to execute.
+
+    Returns:
+        The executed module object (it is not registered in ``sys.modules``).
+
+    Raises:
+        ImportError: If no loader can be determined for `module_path`,
+            e.g. because the file lacks a recognised Python suffix.
+    """
+    spec = importlib.util.spec_from_file_location(module_name, module_path)
+    # spec_from_file_location signals an unsupported path by returning None, not by raising.
+    if spec is None or spec.loader is None:
+        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+# 1) Preprocess the data
+X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+
+# 2) Auto feature engineering
+# Every `feature/feat*.py` module exposes `feature_engineering_cls`; each class is fitted on the
+# training data and then applied to all three splits.
+X_train_l, X_valid_l = [], []
+X_test_l = []
+
+for f in DIRNAME.glob("feature/feat*.py"):
+    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
+    cls.fit(X_train)
+    X_train_f = cls.transform(X_train)
+    X_valid_f = cls.transform(X_valid)
+    X_test_f = cls.transform(X_test)
+
+    # Keep a feature group only if it yields the same number of columns on every split.
+    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
+        X_train_l.append(X_train_f)
+        X_valid_l.append(X_valid_f)
+        X_test_l.append(X_test_f)
+
+# Put the feature groups side by side; `keys` adds a `feature_<i>` top level to the column labels.
+X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
+X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
+X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])
+
+print(X_train.shape, X_valid.shape, X_test.shape)
+
+# Handle inf and -inf values and impute missing ones
+X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
+
+
+# 3) Train one model per `model/model*.py`, each paired with its `model/select*.py` module
+model_l = []  # list[tuple[model, predict_func, select_module]]
+for f in DIRNAME.glob("model/model*.py"):
+    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
+    select_m = import_module_from_path(select_python_path.stem, select_python_path)
+    # `select` may rewrite the column labels of its argument, hence the copies.
+    X_train_selected = select_m.select(X_train.copy())
+    X_valid_selected = select_m.select(X_valid.copy())
+
+    m = import_module_from_path(f.stem, f)
+    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
+
+# 4) Evaluate the model on the validation set
+metrics_all = []
+for model, predict_func, select_m in model_l:
+    X_valid_selected = select_m.select(X_valid.copy())
+    y_valid_pred = predict_func(model, X_valid_selected)
+    rmse = root_mean_squared_error(y_valid, y_valid_pred)
+    print(f"final root mean squared error on valid set: {rmse}")
+    metrics_all.append(rmse)
+
+# 5) Save the best (lowest) validation RMSE
+min_index = np.argmin(metrics_all)
+pd.Series(data=[metrics_all[min_index]], index=["root mean squared error"]).to_csv("submission_score.csv")
+
+# 6) Make predictions on the test set with the best model
+X_test_selected = model_l[min_index][2].select(X_test.copy())
+# The target is used on its original scale during training, so the predictions are written out
+# unchanged; adding an offset here would bias every predicted fare.
+y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten()
+
+
+# 7) Submit predictions for the test set
+submission_result = pd.DataFrame(y_test_pred, columns=["fare_amount"])
+submission_result.insert(0, "key", ids)
+
+submission_result.to_csv("submission.csv", index=False)
diff --git a/...ios/kaggle/experiment/tabular-playground-series-dec-2021_template/fea_share_preprocess.py b/...ios/kaggle/experiment/tabular-playground-series-dec-2021_template/fea_share_preprocess.py
@@ -6,6 +6,25 @@
 from sklearn.model_selection import train_test_split
 from sklearn.preprocessing import LabelEncoder
 
+# NOTE(review): not referenced by the code visible here, and `prepreprocess` below drops "Id",
+# not "key" -- this looks like a leftover from another template; confirm before relying on it.
+index_col_name = "key"
+
+
+def prepreprocess():
+    """
+    Read the raw training CSV and produce a hold-out split.
+
+    The "Id" column is discarded and "Cover_Type", shifted down by one, becomes
+    the target. Rows are divided 80/20 into a training and a validation part
+    using a fixed random seed.
+
+    Returns:
+        A tuple ``(X_train, X_valid, y_train, y_valid)``.
+    """
+    raw_df = pd.read_csv("/kaggle/input/train.csv")
+    raw_df = raw_df.drop(columns=["Id"])
+
+    target = raw_df["Cover_Type"] - 1
+    features = raw_df.drop(columns=["Cover_Type"])
+
+    # A fixed random_state keeps the split identical from run to run.
+    X_train, X_valid, y_train, y_valid = train_test_split(features, target, test_size=0.20, random_state=42)
+    return X_train, X_valid, y_train, y_valid
+
 
 def preprocess_script():
     """

diff --git a/rdagent/scenarios/kaggle/experiment/workspace.py b/rdagent/scenarios/kaggle/experiment/workspace.py
@@ -26,10 +26,11 @@
 
 
 class KGFBWorkspace(FBWorkspace):
-    def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
+    def __init__(self, template_folder_path: Path, *args, entry="python train.py", **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.inject_code_from_folder(template_folder_path)
         self.data_description: List[Tuple[str, int]] = []
+        self.entry = entry  # this is for debugging (you may want to change it into `sleep 1000`)
 
     @property
     def model_description(self) -> dict[str, str]:
@@ -85,7 +86,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
 
         execute_log = kgde.run(
             local_path=str(self.workspace_path),
-            entry=f"python train.py",
+            entry=self.entry,
             env=run_env,
             running_extra_volume=running_extra_volume,
         )

diff --git a/rdagent/scenarios/kaggle/kaggle_crawler.py b/rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -1,6 +1,7 @@
 # %%
 import bisect
 import json
+import shutil
 import subprocess
 import time
 import zipfile
@@ -124,7 +125,18 @@ def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.l
                 f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/private/test.csv ./{competition}/valid.csv'",
                 local_path=local_path,
             )
-
+            # NOTE:
+            # Patching:  due to mle has special renaming mechanism for different competition;
+            # We have to switch the schema back to a uniform one;
+            if competition in {"new-york-city-taxi-fare-prediction"}:
+                cpath = Path(local_path) / f"{competition}"
+                labels_path = cpath / "labels.csv"
+                train_path = cpath / "train.csv"
+                if labels_path.exists():
+                    shutil.copy(labels_path, train_path)
+                else:
+                    logger.error(f"labels.csv not found in {cpath}")
+                    raise FileNotFoundError(f"{labels_path} does not exist")
     else:
         zipfile_path = f"{local_path}/zip_files"
         if not Path(f"{zipfile_path}/{competition}.zip").exists():