Skip to content

Commit

Permalink
feat: new-york-city-taxi-fare-prediction_template (#488)
Browse files Browse the repository at this point in the history
* copy init version

* feat: new-york-city-taxi-fare-prediction_template

* add move to linear model

* Add more details about docker

* auto lint

* auto lint with new black
  • Loading branch information
you-n-g authored Nov 15, 2024
1 parent f6c522b commit a9caab7
Show file tree
Hide file tree
Showing 15 changed files with 339 additions and 4 deletions.
1 change: 0 additions & 1 deletion rdagent/components/proposal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@


class LLMHypothesisGen(HypothesisGen):

def __init__(self, scen: Scenario):
super().__init__(scen)

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

index_name = "key"
label_name = "fare_amount"


def prepreprocess():
    """
    Read the raw training CSV and produce an 80/20 train/validation split.

    The identifier column (``index_name``) is discarded, the ``label_name`` column becomes
    the target, and every remaining column is kept as a feature.

    Returns:
        The tuple ``(X_train, X_valid, y_train, y_valid)``.
    """
    raw_df = pd.read_csv("/kaggle/input/train.csv").drop([index_name], axis=1)

    # Separate the regression target from the feature columns
    target = raw_df[label_name]
    features = raw_df.drop([label_name], axis=1)

    # A fixed random_state keeps the split identical from run to run
    train_feats, valid_feats, train_target, valid_target = train_test_split(
        features, target, test_size=0.20, random_state=42
    )
    return train_feats, valid_feats, train_target, valid_target


def preprocess_script():
    """
    Assemble the train, validation and test inputs for the pipeline.

    When ``/kaggle/input/X_train.pkl`` exists, every split is loaded from the pickles in
    that directory and returned followed by the unpacked contents of ``others.pkl``.
    Otherwise the training data is split by ``prepreprocess()`` and the test set is read
    from ``test.csv``; its ``index_name`` column is returned separately as the ids.
    """
    if not os.path.exists("/kaggle/input/X_train.pkl"):
        # No cached splits: build everything from the raw CSV files
        X_train, X_valid, y_train, y_valid = prepreprocess()

        test_df = pd.read_csv("/kaggle/input/test.csv")
        ids = test_df[index_name]
        X_test = test_df.drop([index_name], axis=1)

        return X_train, X_valid, y_train, y_valid, X_test, ids

    # Cached splits are available: load them in the order they are returned
    cached_splits = (
        pd.read_pickle("/kaggle/input/X_train.pkl"),
        pd.read_pickle("/kaggle/input/X_valid.pkl"),
        pd.read_pickle("/kaggle/input/y_train.pkl"),
        pd.read_pickle("/kaggle/input/y_valid.pkl"),
        pd.read_pickle("/kaggle/input/X_test.pkl"),
    )
    others = pd.read_pickle("/kaggle/input/others.pkl")

    return (*cached_splits, *others)


def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Replace infinities with NaN, mean-impute missing values, and drop duplicate columns.

    The imputer is fitted on the training frame only and then applied to all three frames,
    so validation and test data never influence the imputation statistics.

    Unlike a plain ``pd.DataFrame(array, columns=...)`` rebuild, the returned frames keep the
    row index of their inputs, so they stay aligned with the corresponding target series.
    The input frames themselves are not modified.

    Args:
        X_train: Training features; used to fit the imputer.
        X_valid: Validation features.
        X_test: Test features.

    Returns:
        The tuple ``(X_train, X_valid, X_test)`` of cleaned frames.
    """
    # replace() without inplace returns new frames, leaving the caller's data untouched
    frames = [df.replace([np.inf, -np.inf], np.nan) for df in (X_train, X_valid, X_test)]

    # Learn the column means from the training split only
    imputer = SimpleImputer(strategy="mean")
    imputer.fit(frames[0])

    cleaned = []
    for df in frames:
        # Re-attach both the column labels and the original row index to the imputed array
        imputed = pd.DataFrame(imputer.transform(df), columns=df.columns, index=df.index)
        # Keep only the first occurrence of each column label
        cleaned.append(imputed.loc[:, ~imputed.columns.duplicated()])

    X_train, X_valid, X_test = cleaned
    return X_train, X_valid, X_test
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class DatetimeFeature:
    """Expand the ``pickup_datetime`` column into numeric calendar features."""

    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.

        Nothing is learned from ``train_df``; the method only exists so the class offers
        the fit/transform pair.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Return a copy of ``X`` with ``pickup_datetime`` replaced by calendar features.

        The ``pickup_datetime`` column is parsed with the format ``%Y-%m-%d %H:%M:%S UTC``
        and replaced by the columns ``hour``, ``day``, ``month``, ``weekday`` and ``year``.
        The input frame is left unchanged, so the same frame can be transformed repeatedly.

        Args:
            X: Frame containing a ``pickup_datetime`` column.

        Returns:
            A new frame with the derived columns appended and ``pickup_datetime`` removed.
        """
        # Work on a copy: mutating the caller's frame would drop pickup_datetime from it
        # and make any later transform of the same frame fail with a KeyError.
        X = X.copy()
        pickup = pd.to_datetime(X["pickup_datetime"], format="%Y-%m-%d %H:%M:%S UTC")
        X["hour"] = pickup.dt.hour
        X["day"] = pickup.dt.day
        X["month"] = pickup.dt.month
        X["weekday"] = pickup.dt.weekday
        X["year"] = pickup.dt.year
        return X.drop(columns=["pickup_datetime"])


feature_engineering_cls = DatetimeFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
"""
Motivation of the model:
The Linear Regression model is chosen for its simplicity and interpretability. It is a good starting point for regression tasks
and provides a baseline to compare more complex models against. Linear Regression assumes a linear relationship between the
features and the target variable, which can be a reasonable assumption for many problems.
"""

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Linear Regression model and print its mean squared error on the validation data.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_valid: Validation features, used only for the printed score.
        y_valid: Validation targets.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # LinearRegression.fit returns the estimator itself, so build and train in one step
    regressor = LinearRegression().fit(X_train, y_train)

    # Report hold-out performance
    valid_mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    print(f"Validation Mean Squared Error: {valid_mse:.4f}")

    return regressor


def predict(model, X):
    """
    Run the trained model on ``X`` and return its predictions as a single-column 2-D array.
    """
    # reshape(-1, 1) turns the prediction vector into one column, one row per sample
    return model.predict(X).reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels flattened to plain strings.

    A frame whose columns have a single level is returned untouched. For multi-level
    columns, each label tuple is joined with ``_`` and assigned back onto ``X`` in place.
    """
    # No real selection happens yet: every feature is treated as relevant.
    if X.columns.nlevels != 1:
        flat_names = []
        for label_parts in X.columns.values:
            flat_names.append("_".join(map(str, label_parts)).strip())
        X.columns = flat_names
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels flattened to plain strings.

    A frame whose columns have a single level is returned untouched. For multi-level
    columns, each label tuple is joined with ``_`` and assigned back onto ``X`` in place.
    """
    # No real selection happens yet: every feature is treated as relevant.
    if X.columns.nlevels != 1:
        flat_names = []
        for label_parts in X.columns.values:
            flat_names.append("_".join(map(str, label_parts)).strip())
        X.columns = flat_names
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels flattened to plain strings.

    A frame whose columns have a single level is returned untouched. For multi-level
    columns, each label tuple is joined with ``_`` and assigned back onto ``X`` in place.
    """
    # No real selection happens yet: every feature is treated as relevant.
    if X.columns.nlevels != 1:
        flat_names = []
        for label_parts in X.columns.values:
            flat_names.append("_".join(map(str, label_parts)).strip())
        X.columns = flat_names
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels flattened to plain strings.

    A frame whose columns have a single level is returned untouched. For multi-level
    columns, each label tuple is joined with ``_`` and assigned back onto ``X`` in place.
    """
    # No real selection happens yet: every feature is treated as relevant.
    if X.columns.nlevels != 1:
        flat_names = []
        for label_parts in X.columns.values:
            flat_names.append("_".join(map(str, label_parts)).strip())
        X.columns = flat_names
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Return ``X`` with all features kept and its column labels flattened to plain strings.

    A frame whose columns have a single level is returned untouched. For multi-level
    columns, each label tuple is joined with ``_`` and assigned back onto ``X`` in place.
    """
    # No real selection happens yet: every feature is treated as relevant.
    if X.columns.nlevels != 1:
        flat_names = []
        for label_parts in X.columns.values:
            flat_names.append("_".join(map(str, label_parts)).strip())
        X.columns = flat_names
    return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import clean_and_impute_data, preprocess_script
from sklearn.metrics import matthews_corrcoef, root_mean_squared_error

# Directory containing this script; the feature/ and model/ files are looked up relative to it
DIRNAME = Path(__file__).absolute().resolve().parent

# Seed both the stdlib and NumPy random generators for reproducibility
SEED = 42
np.random.seed(SEED)
random.seed(SEED)


def compute_metrics_for_classification(y_true, y_pred):
    """Return the Matthews correlation coefficient (MCC) between true and predicted labels."""
    return matthews_corrcoef(y_true, y_pred)


def import_module_from_path(module_name, module_path):
    """
    Load the Python source file at ``module_path`` as a module named ``module_name``.

    The file is executed once and the resulting module object is returned.
    """
    module_spec = importlib.util.spec_from_file_location(module_name, module_path)
    loaded_module = importlib.util.module_from_spec(module_spec)
    # Run the file's top-level code so its definitions appear on the module object
    module_spec.loader.exec_module(loaded_module)
    return loaded_module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only when it yields the same number of columns for every split
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

# Each feature set becomes one top-level column group named feature_<i>
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)


# 3) Train the models
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in DIRNAME.glob("model/model*.py"):
    # Every model/model_<x>.py is paired with a model/select_<x>.py feature selector
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    rmse = root_mean_squared_error(y_valid, y_valid_pred)
    print(f"final root mean squared error on valid set: {rmse}")
    metrics_all.append(rmse)

# 5) Save the validation RMSE of the best model (lower is better)
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["root mean squared error"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set and save them
X_test_selected = model_l[min_index][2].select(X_test.copy())
# The fare is a regression target that is never shifted during preprocessing, so the
# predictions are used as-is. Adding an offset here would bias every submitted fare
# relative to the validation score computed above.
y_test_pred = model_l[min_index][1](model_l[min_index][0], X_test_selected).flatten()


# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["fare_amount"])
submission_result.insert(0, "key", ids)

submission_result.to_csv("submission.csv", index=False)
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,25 @@
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

index_col_name = "key"


def prepreprocess():
    """
    Read the raw training CSV and produce an 80/20 train/validation split.

    The ``Id`` column is discarded and ``Cover_Type`` becomes the target, shifted down by
    one (``Cover_Type - 1``). Every remaining column is kept as a feature.

    Returns:
        The tuple ``(X_train, X_valid, y_train, y_valid)``.
    """
    raw_df = pd.read_csv("/kaggle/input/train.csv").drop(["Id"], axis=1)

    # Separate the (shifted) label from the feature columns
    target = raw_df["Cover_Type"] - 1
    features = raw_df.drop(["Cover_Type"], axis=1)

    # A fixed random_state keeps the split identical from run to run
    train_feats, valid_feats, train_target, valid_target = train_test_split(
        features, target, test_size=0.20, random_state=42
    )
    return train_feats, valid_feats, train_target, valid_target


def preprocess_script():
"""
Expand Down
5 changes: 3 additions & 2 deletions rdagent/scenarios/kaggle/experiment/workspace.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,10 +26,11 @@


class KGFBWorkspace(FBWorkspace):
def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
def __init__(self, template_folder_path: Path, *args, entry="python train.py", **kwargs) -> None:
super().__init__(*args, **kwargs)
self.inject_code_from_folder(template_folder_path)
self.data_description: List[Tuple[str, int]] = []
self.entry = entry # this is for debugging (you may want to change it into `sleep 1000`)

@property
def model_description(self) -> dict[str, str]:
Expand Down Expand Up @@ -85,7 +86,7 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:

execute_log = kgde.run(
local_path=str(self.workspace_path),
entry=f"python train.py",
entry=self.entry,
env=run_env,
running_extra_volume=running_extra_volume,
)
Expand Down
14 changes: 13 additions & 1 deletion rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# %%
import bisect
import json
import shutil
import subprocess
import time
import zipfile
Expand Down Expand Up @@ -124,7 +125,18 @@ def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.l
f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/private/test.csv ./{competition}/valid.csv'",
local_path=local_path,
)

# NOTE:
# Patching: because mle has a special renaming mechanism for different competitions,
# we have to switch the schema back to a uniform one.
if competition in {"new-york-city-taxi-fare-prediction"}:
cpath = Path(local_path) / f"{competition}"
labels_path = cpath / "labels.csv"
train_path = cpath / "train.csv"
if labels_path.exists():
shutil.copy(labels_path, train_path)
else:
logger.error(f"labels.csv not found in {cpath}")
raise FileNotFoundError(f"{labels_path} does not exist")
else:
zipfile_path = f"{local_path}/zip_files"
if not Path(f"{zipfile_path}/{competition}.zip").exists():
Expand Down
Loading

0 comments on commit a9caab7

Please sign in to comment.