-
-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: new-york-city-taxi-fare-prediction_template (#488)
* copy init version * feat: new-york-city-taxi-fare-prediction_template * add move to linear model * Add more details about docker * auto lint * auto lint with new black
- Loading branch information
Showing
15 changed files
with
339 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,6 @@ | |
|
||
|
||
class LLMHypothesisGen(HypothesisGen): | ||
|
||
def __init__(self, scen: Scenario): | ||
super().__init__(scen) | ||
|
||
|
75 changes: 75 additions & 0 deletions
75
...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/fea_share_preprocess.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import os | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from sklearn.impute import SimpleImputer | ||
from sklearn.model_selection import train_test_split | ||
|
||
# Identifier column of the raw CSV files: dropped from the training features
# and kept aside for the test rows.
index_name = "key"
# Target column that is separated from the training features.
label_name = "fare_amount"
|
||
|
||
def prepreprocess():
    """
    Read the raw training CSV and carve it into train/validation partitions.

    The identifier column (``index_name``) is discarded, the target column
    (``label_name``) is separated from the remaining feature columns, and an
    80/20 split with a fixed seed is taken.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``.
    """
    raw = pd.read_csv("/kaggle/input/train.csv")
    raw = raw.drop([index_name], axis=1)

    target = raw[label_name]
    features = raw.drop([label_name], axis=1)

    # random_state is pinned so repeated runs produce the same partition.
    splits = train_test_split(features, target, test_size=0.20, random_state=42)
    return tuple(splits)
|
||
|
||
def preprocess_script():
    """
    Produce the train/validation/test inputs consumed by the training script.

    When ``/kaggle/input/X_train.pkl`` exists, previously pickled splits are
    loaded and returned instead of being rebuilt. Otherwise the training data
    is split via ``prepreprocess`` and the test CSV is read, with its
    identifier column separated from the features.

    Returns:
        Tuple starting with ``(X_train, X_valid, y_train, y_valid, X_test)``.
        On the CSV path one more element follows: the Series of test
        identifiers. On the pickle path the trailing elements are whatever
        ``others.pkl`` unpacks to.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        # NOTE(review): only X_train.pkl is probed; the remaining pickles are
        # assumed to sit next to it.
        cached_paths = (
            "/kaggle/input/X_train.pkl",
            "/kaggle/input/X_valid.pkl",
            "/kaggle/input/y_train.pkl",
            "/kaggle/input/y_valid.pkl",
            "/kaggle/input/X_test.pkl",
        )
        cached = [pd.read_pickle(path) for path in cached_paths]
        extras = pd.read_pickle("/kaggle/input/others.pkl")
        return (*cached, *extras)

    train_features, valid_features, train_target, valid_target = prepreprocess()

    # The test file has no target column; only the identifier is split off.
    test_df = pd.read_csv("/kaggle/input/test.csv")
    test_ids = test_df[index_name]
    test_features = test_df.drop([index_name], axis=1)

    return train_features, valid_features, train_target, valid_target, test_features, test_ids
|
||
|
||
def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Replace infinities, mean-impute missing values and drop duplicate columns.

    The imputer is fitted on the training frame only and then applied to the
    validation and test frames.

    Args:
        X_train: Training features; used to fit the imputer.
        X_valid: Validation features.
        X_test: Test features.

    Returns:
        Tuple ``(X_train, X_valid, X_test)`` of new DataFrames. The frames
        passed in are not modified, and each returned frame keeps the row
        index of its input.
    """
    # Replace inf and -inf with NaN. New frames are built instead of using
    # inplace=True so the caller's objects stay untouched.
    X_train = X_train.replace([np.inf, -np.inf], np.nan)
    X_valid = X_valid.replace([np.inf, -np.inf], np.nan)
    X_test = X_test.replace([np.inf, -np.inf], np.nan)

    # Impute missing values. The imputer returns bare arrays, so both the
    # column labels and the row index are re-attached explicitly.
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns, index=X_valid.index)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns, index=X_test.index)

    # Remove duplicate columns: only the first occurrence of each label is kept.
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    return X_train, X_valid, X_test
30 changes: 30 additions & 0 deletions
30
...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/feature/feature.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
import pandas as pd | ||
|
||
""" | ||
Here is the feature engineering code for each task, with a class that has a fit and transform method. | ||
Remember | ||
""" | ||
|
||
|
||
class DatetimeFeature:
    """Expand the ``pickup_datetime`` column into calendar components."""

    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.

        Nothing is learned for this feature, so the method does nothing.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Derive hour, day, month, weekday and year from ``pickup_datetime``.

        Args:
            X: Frame with a ``pickup_datetime`` column whose values are
                formatted like ``"2015-01-27 13:08:24 UTC"``.

        Returns:
            A new DataFrame in which ``pickup_datetime`` is removed and the
            five derived columns are added. ``X`` itself is left unchanged, so
            the same frame can be transformed again.
        """
        pickup = pd.to_datetime(X["pickup_datetime"], format="%Y-%m-%d %H:%M:%S UTC")
        # drop() and assign() both return new frames, so the input is never mutated.
        return X.drop(columns=["pickup_datetime"]).assign(
            hour=pickup.dt.hour,
            day=pickup.dt.day,
            month=pickup.dt.month,
            weekday=pickup.dt.weekday,
            year=pickup.dt.year,
        )


feature_engineering_cls = DatetimeFeature
38 changes: 38 additions & 0 deletions
38
...arios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/model_linear.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
""" | ||
Motivation of the model: | ||
The Linear Regression model is chosen for its simplicity and interpretability. It is a good starting point for regression tasks | ||
and provides a baseline to compare more complex models against. Linear Regression assumes a linear relationship between the | ||
features and the target variable, which can be a reasonable assumption for many problems. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn.linear_model import LinearRegression | ||
from sklearn.metrics import mean_squared_error | ||
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Linear Regression model and print its validation error.

    The validation split is used only to compute the printed mean squared
    error.

    Args:
        X_train: Training features.
        y_train: Training targets.
        X_valid: Validation features.
        y_valid: Validation targets.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Report how the fitted model does on the held-out split.
    mse = mean_squared_error(y_valid, regressor.predict(X_valid))
    print(f"Validation Mean Squared Error: {mse:.4f}")

    return regressor
|
||
|
||
def predict(model, X):
    """
    Make predictions with a trained model and return them as a column vector.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features to predict on; passed to ``model.predict`` unchanged.

    Returns:
        Array of shape ``(n, 1)`` holding the predictions.
    """
    # Local import: this module has no file-level numpy import.
    import numpy as np

    # asarray lets predictors that return lists or Series be reshaped too.
    y_pred = np.asarray(model.predict(X))

    return y_pred.reshape(-1, 1)
12 changes: 12 additions & 0 deletions
12
...os/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_lightgbm.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    No columns are filtered out at the moment. The only transformation is that
    multi-level column labels are flattened into single strings by joining the
    levels with "_" (``("feature_0", "hour")`` becomes ``"feature_0_hour"``).

    Args:
        X: Feature frame with either flat or multi-level column labels.

    Returns:
        ``X`` itself when its columns are already flat; otherwise a shallow
        copy of ``X`` carrying the flattened labels, so the labels of ``X``
        are left as they were.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # A shallow copy gets its own column labels without duplicating the data.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...rios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_linear.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    No columns are filtered out at the moment. The only transformation is that
    multi-level column labels are flattened into single strings by joining the
    levels with "_" (``("feature_0", "hour")`` becomes ``"feature_0_hour"``).

    Args:
        X: Feature frame with either flat or multi-level column labels.

    Returns:
        ``X`` itself when its columns are already flat; otherwise a shallow
        copy of ``X`` carrying the flattened labels, so the labels of ``X``
        are left as they were.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # A shallow copy gets its own column labels without duplicating the data.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...cenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_nn.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    No columns are filtered out at the moment. The only transformation is that
    multi-level column labels are flattened into single strings by joining the
    levels with "_" (``("feature_0", "hour")`` becomes ``"feature_0_hour"``).

    Args:
        X: Feature frame with either flat or multi-level column labels.

    Returns:
        ``X`` itself when its columns are already flat; otherwise a shallow
        copy of ``X`` carrying the flattened labels, so the labels of ``X``
        are left as they were.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # A shallow copy gets its own column labels without duplicating the data.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...aggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_randomforest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    No columns are filtered out at the moment. The only transformation is that
    multi-level column labels are flattened into single strings by joining the
    levels with "_" (``("feature_0", "hour")`` becomes ``"feature_0_hour"``).

    Args:
        X: Feature frame with either flat or multi-level column labels.

    Returns:
        ``X`` itself when its columns are already flat; otherwise a shallow
        copy of ``X`` carrying the flattened labels, so the labels of ``X``
        are left as they were.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # A shallow copy gets its own column labels without duplicating the data.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...ios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/model/select_xgboost.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    No columns are filtered out at the moment. The only transformation is that
    multi-level column labels are flattened into single strings by joining the
    levels with "_" (``("feature_0", "hour")`` becomes ``"feature_0_hour"``).

    Args:
        X: Feature frame with either flat or multi-level column labels.

    Returns:
        ``X`` itself when its columns are already flat; otherwise a shallow
        copy of ``X`` carrying the flattened labels, so the labels of ``X``
        are left as they were.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X

    # A shallow copy gets its own column labels without duplicating the data.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
91 changes: 91 additions & 0 deletions
91
rdagent/scenarios/kaggle/experiment/new-york-city-taxi-fare-prediction_template/train.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,91 @@ | ||
import importlib.util | ||
import random | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from fea_share_preprocess import clean_and_impute_data, preprocess_script | ||
from sklearn.metrics import matthews_corrcoef, root_mean_squared_error | ||
|
||
# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
# Directory holding this script; the feature/ and model/ globs below are relative to it.
# resolve() already yields an absolute path, so no separate absolute() call is needed.
DIRNAME = Path(__file__).resolve().parent
|
||
|
||
def compute_metrics_for_classification(y_true, y_pred):
    """Return the Matthews correlation coefficient of ``y_pred`` against ``y_true``."""
    return matthews_corrcoef(y_true, y_pred)
|
||
|
||
def import_module_from_path(module_name, module_path):
    """
    Load a Python source file as a module object.

    Args:
        module_name: Name given to the resulting module.
        module_path: Location of the file to execute.

    Returns:
        The executed module. It is not added to ``sys.modules``.

    Raises:
        ImportError: If no loader can be determined for ``module_path``
            (for example, the file lacks a Python source suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None for unsupported files; fail with a
    # clear error instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||
|
||
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

# sorted() fixes the discovery order, so the feature_<i> labels are stable between runs.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature only when it yields the same number of columns on every split.
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

# Each feature's output is placed under its own top-level column key.
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)

# 3) Train every model found under model/, each paired with its select_* module
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in sorted(DIRNAME.glob("model/model*.py")):
    # model_<name>.py is paired with select_<name>.py in the same directory.
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    rmse = root_mean_squared_error(y_valid, y_valid_pred)
    print(f"final root mean squared error on valid set: {rmse}")
    metrics_all.append(rmse)

# 5) Save the best (lowest) validation RMSE
min_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[min_index]], index=["root mean squared error"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model
best_model, best_predict, best_select = model_l[min_index]
X_test_selected = best_select.select(X_test.copy())
# Predictions are used as produced: no offset is added, so the submitted values
# are on the same scale as the ones scored on the validation set above.
y_test_pred = best_predict(best_model, X_test_selected).flatten()

# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["fare_amount"])
# Attach the ids by position rather than by index alignment.
submission_result.insert(0, "key", np.asarray(ids))

submission_result.to_csv("submission.csv", index=False)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.