Showing 10 changed files with 259 additions and 3 deletions.
34 changes: 34 additions & 0 deletions
...enarios/kaggle/experiment/ventilator-pressure-prediction_template/fea_share_preprocess.py
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, *others

    train_df = pd.read_csv("/kaggle/input/train.csv")
    test_df = pd.read_csv("/kaggle/input/test.csv")

    X = train_df.drop(["pressure", "id"], axis=1)
    y = train_df["pressure"]

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

    # Load and preprocess the test data
    ids = test_df["id"]
    X_test = test_df.drop(["id"], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids
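A minimal usage sketch (not part of the commit), assuming it is run next to this file in an environment where the hard-coded /kaggle/input CSVs or cached pickles exist:

# Hypothetical smoke test of the preprocessing entry point; the paths are the
# /kaggle/input ones hard-coded above, so this only runs in that environment.
from fea_share_preprocess import preprocess_script

X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
print(X_train.shape, X_valid.shape, X_test.shape)  # feature frames
print(len(y_train), len(y_valid), len(ids))        # targets and test-set ids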
23 changes: 23 additions & 0 deletions
...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/feature/feature.py
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember that the transform method must be applicable to the training, validation, and test data alike.
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
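Because train.py discovers every feature/feat*.py module and instantiates its feature_engineering_cls, a new feature only needs to follow the same fit/transform contract. A hedged sketch of such a drop-in module (the breath_id and u_in column names are assumed from the competition's schema, not defined in this commit):

# Hypothetical feature/feat_lag.py following the IdentityFeature contract.
# Column names (breath_id, u_in) are assumed from the competition data.
import pandas as pd


class LagUInFeature:
    def fit(self, train_df: pd.DataFrame):
        # Stateless transform: nothing is learned from the training data.
        pass

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        # Previous u_in value within each breath; the first step is filled with 0.
        X["u_in_lag1"] = X.groupby("breath_id")["u_in"].shift(1).fillna(0)
        return X


feature_engineering_cls = LagUInFeature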
40 changes: 40 additions & 0 deletions
...ios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_randomforest.py
""" | ||
Motivation of the model: | ||
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. | ||
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good | ||
baseline model for many classification tasks. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn.ensemble import RandomForestRegressor | ||
from sklearn.metrics import mean_absolute_error | ||
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series): | ||
""" | ||
Define and train the Random Forest model. Merge feature selection into the pipeline. | ||
""" | ||
# Initialize the Random Forest model | ||
model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1) | ||
|
||
# Fit the model | ||
model.fit(X_train, y_train) | ||
|
||
# Predict on the validation set | ||
y_valid_pred = model.predict(X_valid) | ||
|
||
# Calculate the mean absolute error on the validation set | ||
mae = mean_absolute_error(y_valid, y_valid_pred) | ||
print(f"Validation MAE of RandomForestRegressor: {mae}") | ||
|
||
return model | ||
|
||
|
||
def predict(model, X): | ||
""" | ||
Keep feature selection's consistency and make predictions. | ||
""" | ||
# Predict using the trained model | ||
y_pred = model.predict(X) | ||
|
||
return y_pred.reshape(-1, 1) |
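A small smoke test of the fit/predict contract on synthetic data (not part of the commit; in the pipeline these functions receive the preprocessed, feature-selected frames built by train.py):

# Hypothetical standalone check of model_randomforest.fit / predict,
# run from the model/ directory with synthetic inputs.
import numpy as np
import pandas as pd
from model_randomforest import fit, predict

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 3)), columns=["a", "b", "c"])
y_demo = pd.Series(rng.normal(size=200))

model = fit(X_demo[:150], y_demo[:150], X_demo[150:], y_demo[150:])
y_pred = predict(model, X_demo[150:])
print(y_pred.shape)  # (50, 1): predict reshapes output to a column vector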
40 changes: 40 additions & 0 deletions
...cenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/model_xgboost.py
""" | ||
motivation of the model | ||
""" | ||
|
||
import pandas as pd | ||
import xgboost as xgb | ||
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> xgb.Booster: | ||
"""Define and train the model. Merge feature_select""" | ||
# 将数据转换为 DMatrix 并指定设备 | ||
dtrain = xgb.DMatrix(X_train, label=y_train) | ||
dvalid = xgb.DMatrix(X_valid, label=y_valid) | ||
|
||
params = { | ||
"learning_rate": 0.1, | ||
"subsample": 0.95, | ||
"colsample_bytree": 0.11, | ||
"max_depth": 2, | ||
"booster": "gbtree", | ||
"reg_lambda": 66.1, | ||
"reg_alpha": 15.9, | ||
"random_state": 42, | ||
"tree_method": "hist", | ||
"device": "cuda", | ||
"eval_metric": "mae", | ||
} | ||
num_boost_round = 1000 | ||
|
||
model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, "validation")], verbose_eval=100) | ||
return model | ||
|
||
|
||
def predict(model: xgb.Booster, X): | ||
""" | ||
Keep feature select's consistency. | ||
""" | ||
dtest = xgb.DMatrix(X) | ||
y_pred = model.predict(dtest) | ||
return y_pred |
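The parameter set above assumes a CUDA-capable GPU ("device": "cuda"). A hedged sketch of a CPU fallback that could sit inside fit just before xgb.train, using nvidia-smi visibility as a rough availability heuristic (this guard is an assumption, not part of the commit):

# Hypothetical guard: fall back to CPU training when no NVIDIA GPU is visible.
# tree_method="hist" works on both devices, so only the device entry changes.
import shutil

if shutil.which("nvidia-smi") is None:
    params["device"] = "cpu"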
12 changes: 12 additions & 0 deletions
...nt/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_nn.py
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the two-level column index produced when train.py concatenates feature blocks.
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
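The nlevels check matters because train.py concatenates the accepted feature blocks with pd.concat(..., keys=...), which yields a two-level column index. A toy illustration of the flattening, run alongside model/select_nn.py (the column names are made up for the example):

# Toy check of select() on a two-level column index like the one train.py builds.
import pandas as pd
from select_nn import select

df = pd.DataFrame(
    [[1.0, 2.0]],
    columns=pd.MultiIndex.from_tuples([("feature_0", "u_in"), ("feature_0", "u_out")]),
)
print(select(df).columns.tolist())  # ['feature_0_u_in', 'feature_0_u_out']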
12 changes: 12 additions & 0 deletions
...os/kaggle/experiment/ventilator-pressure-prediction_template/model/select_randomforest.py
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the two-level column index produced when train.py concatenates feature blocks.
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
12 changes: 12 additions & 0 deletions
...enarios/kaggle/experiment/ventilator-pressure-prediction_template/model/select_xgboost.py
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded to feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the two-level column index produced when train.py concatenates feature blocks.
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
82 changes: 82 additions & 0 deletions
rdagent/scenarios/kaggle/experiment/ventilator-pressure-prediction_template/train.py
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_absolute_error

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)
        print(f"Feature [{f.stem}] has been added to the feature list")

X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])


# 3) Train each model with its matching feature-selection module
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in DIRNAME.glob("model/model*.py"):
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
    print(f"Model [{f.stem}] has been trained")

# 4) Evaluate the models on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    mae = mean_absolute_error(y_valid, y_valid_pred)
    print(f"[{type(model).__name__}] MAE on valid set: {mae}")
    metrics_all.append(mae)

# 5) Save the best validation MAE
best_index = np.argmin(metrics_all)  # index of the model with the lowest validation MAE
pd.Series(data=[metrics_all[best_index]], index=["MAE"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set and save them
# Note: a constant +1 offset is added to the raw test predictions.
X_test_selected = model_l[best_index][2].select(X_test.copy())
y_test_pred = model_l[best_index][1](model_l[best_index][0], X_test_selected).flatten() + 1


# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["pressure"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)
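A hedged post-run sanity check (not part of the commit) that the written submission has the expected shape, reusing the /kaggle/input/test.csv path already hard-coded in the preprocessing script:

# Hypothetical check after train.py finishes: one "pressure" prediction per test-set id.
import pandas as pd

sub = pd.read_csv("submission.csv")
test_df = pd.read_csv("/kaggle/input/test.csv")
assert list(sub.columns) == ["id", "pressure"]
assert len(sub) == len(test_df)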