-
-
Notifications
You must be signed in to change notification settings - Fork 89
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: (Kaggle) add base template for competition: tabular-playground-…
…series-may-2022 (#481) * add tpl kaggle * CI
- Loading branch information
Showing
8 changed files
with
250 additions
and
0 deletions.
There are no files selected for viewing
37 changes: 37 additions & 0 deletions
37
...ios/kaggle/experiment/tabular-playground-series-may-2022_template/fea_share_preprocess.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
import os | ||
|
||
import pandas as pd | ||
from sklearn.model_selection import train_test_split | ||
from sklearn.preprocessing import MinMaxScaler | ||
|
||
|
||
def preprocess_script():
    """
    Load the competition data and return the train / validation / test splits.

    If a cached ``X_train.pkl`` exists under ``/kaggle/input``, all six cached
    pickles are loaded and returned as-is (the contents of ``others.pkl`` are
    unpacked onto the end of the returned tuple). Otherwise the raw CSV files
    are read, the ``id`` and ``f_27`` columns are dropped, the remaining
    features are min-max scaled with a scaler fitted on the training data,
    and 20% of the training rows are held out for validation.

    Returns:
        X_train, X_valid, y_train, y_valid, X_test, followed by the test ids
        (or by whatever ``others.pkl`` holds on the cached path).
    """
    cached_paths = [
        "/kaggle/input/X_train.pkl",
        "/kaggle/input/X_valid.pkl",
        "/kaggle/input/y_train.pkl",
        "/kaggle/input/y_valid.pkl",
        "/kaggle/input/X_test.pkl",
        "/kaggle/input/others.pkl",
    ]
    # Only the first pickle is probed; the rest are assumed to sit next to it.
    if os.path.exists(cached_paths[0]):
        *splits, others = [pd.read_pickle(path) for path in cached_paths]
        return (*splits, *others)

    train_df = pd.read_csv("/kaggle/input/train.csv")
    test_df = pd.read_csv("/kaggle/input/test.csv")

    # Separate the label from the features used for training
    target = train_df["target"]
    features = train_df.drop(columns=["target", "id", "f_27"])

    # The scaler is fitted on the training features only and reused for the test set
    scaler = MinMaxScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features))

    X_train, X_valid, y_train, y_valid = train_test_split(
        features_scaled, target, test_size=0.20, random_state=101
    )

    # Keep the test ids aside; they are returned alongside the scaled test features
    ids = test_df["id"]
    test_features = test_df.drop(columns=["id", "f_27"])
    X_test = pd.DataFrame(scaler.transform(test_features))

    return X_train, X_valid, y_train, y_valid, X_test, ids
23 changes: 23 additions & 0 deletions
23
...cenarios/kaggle/experiment/tabular-playground-series-may-2022_template/feature/feature.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
import pandas as pd | ||
|
||
""" | ||
Feature engineering code for this task: a class that exposes `fit` and `transform` methods. | ||
Remember to bind the class to `feature_engineering_cls` so the training script can find it. | ||
""" | ||
|
||
|
||
class IdentityFeature:
    """Pass-through feature step: it learns nothing and hands the data back unchanged."""

    def fit(self, train_df: pd.DataFrame):
        """
        Nothing to learn for the identity feature; the training data is ignored.
        """
        return None

    def transform(self, X: pd.DataFrame):
        """
        Return the input frame itself, without copying or modifying it.
        """
        return X


# Name under which this module publishes its feature engineering class
feature_engineering_cls = IdentityFeature
38 changes: 38 additions & 0 deletions
38
...kaggle/experiment/tabular-playground-series-may-2022_template/model/model_randomforest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
""" | ||
Motivation of the model: | ||
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality. | ||
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good | ||
baseline model for many classification tasks. | ||
""" | ||
|
||
import pandas as pd | ||
from sklearn.ensemble import RandomForestClassifier | ||
from sklearn.metrics import accuracy_score | ||
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
    """
    Train a Random Forest classifier and report its accuracy on the validation split.

    The validation data is used only for the printed accuracy; it does not
    influence training. Returns the fitted classifier.
    """
    forest = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)
    forest.fit(X_train, y_train)

    # Quick sanity check on the held-out split
    accuracy = accuracy_score(y_valid, forest.predict(X_valid))
    print(f"Validation Accuracy: {accuracy:.4f}")

    return forest
|
||
|
||
def predict(model, X):
    """
    Score `X` with the trained model and return positive-class probabilities.

    The training script evaluates predictions with AUROC, which ranks samples
    by a continuous score. Hard 0/1 labels collapse the ROC curve to a single
    operating point, so the probability of the positive class is returned
    instead of the predicted label.

    Args:
        model: Fitted classifier exposing `predict_proba`.
        X: Features, selected the same way as during training.

    Returns:
        Array of shape (n_samples, 1) holding the probability of class 1.
    """
    # Column 1 is the positive class for a binary 0/1 target (classes are sorted)
    positive_proba = model.predict_proba(X)[:, 1]

    return positive_proba.reshape(-1, 1)
34 changes: 34 additions & 0 deletions
34
...rios/kaggle/experiment/tabular-playground-series-may-2022_template/model/model_xgboost.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
""" | ||
motivation of the model | ||
""" | ||
|
||
import pandas as pd | ||
import xgboost as xgb | ||
|
||
|
||
def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> xgb.Booster:
    """
    Train an XGBoost booster with a binary-logistic objective.

    The validation split is passed as an evaluation set named "validation";
    it is used for monitoring only. Returns the trained booster.
    """
    booster_params = dict(
        learning_rate=0.5,
        max_depth=10,
        device="cuda",
        tree_method="hist",
        objective="binary:logistic",
    )

    # Convert the data to DMatrix; the device is specified through the params above
    train_matrix = xgb.DMatrix(X_train, label=y_train)
    valid_matrix = xgb.DMatrix(X_valid, label=y_valid)

    return xgb.train(
        booster_params,
        train_matrix,
        num_boost_round=10,
        evals=[(valid_matrix, "validation")],
        verbose_eval=100,
    )
|
||
|
||
def predict(model: xgb.Booster, X):
    """
    Score `X` with the trained booster and return positive-class probabilities.

    With the "binary:logistic" objective the booster already outputs
    probabilities. They are returned unrounded: the training script evaluates
    with AUROC, which needs a continuous score to rank samples, and rounding
    to 0/1 throws that ranking away.

    Args:
        model: Trained booster.
        X: Features, selected the same way as during training.

    Returns:
        pd.Series of predicted probabilities, one per row of `X`.
    """
    dtest = xgb.DMatrix(X)
    return pd.Series(model.predict(dtest))
12 changes: 12 additions & 0 deletions
12
...cenarios/kaggle/experiment/tabular-playground-series-may-2022_template/model/select_nn.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    For now all features are kept; this is the place to add real feature
    selection logic. Multi-level column labels are flattened into single
    strings by joining the levels with "_".

    The caller's frame is never modified: a frame with single-level columns is
    returned unchanged, otherwise the renaming happens on a shallow copy.
    """
    if X.columns.nlevels == 1:
        return X

    # A shallow copy shares the underlying data but has its own column index,
    # so renaming here does not leak back into the caller's DataFrame.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...aggle/experiment/tabular-playground-series-may-2022_template/model/select_randomforest.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    For now all features are kept; this is the place to add real feature
    selection logic. Multi-level column labels are flattened into single
    strings by joining the levels with "_".

    The caller's frame is never modified: a frame with single-level columns is
    returned unchanged, otherwise the renaming happens on a shallow copy.
    """
    if X.columns.nlevels == 1:
        return X

    # A shallow copy shares the underlying data but has its own column index,
    # so renaming here does not leak back into the caller's DataFrame.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
12 changes: 12 additions & 0 deletions
12
...ios/kaggle/experiment/tabular-playground-series-may-2022_template/model/select_xgboost.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,12 @@ | ||
import pandas as pd | ||
|
||
|
||
def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in fit & predict function.

    For now all features are kept; this is the place to add real feature
    selection logic. Multi-level column labels are flattened into single
    strings by joining the levels with "_".

    The caller's frame is never modified: a frame with single-level columns is
    returned unchanged, otherwise the renaming happens on a shallow copy.
    """
    if X.columns.nlevels == 1:
        return X

    # A shallow copy shares the underlying data but has its own column index,
    # so renaming here does not leak back into the caller's DataFrame.
    flattened = X.copy(deep=False)
    flattened.columns = ["_".join(str(level) for level in col).strip() for col in X.columns.values]
    return flattened
82 changes: 82 additions & 0 deletions
82
rdagent/scenarios/kaggle/experiment/tabular-playground-series-may-2022_template/train.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
import importlib.util | ||
import random | ||
from pathlib import Path | ||
|
||
import numpy as np | ||
import pandas as pd | ||
from fea_share_preprocess import preprocess_script | ||
from sklearn.metrics import roc_auc_score | ||
|
||
# Seed both the NumPy and the stdlib generators so repeated runs are reproducible
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Directory that contains this script; feature/ and model/ are looked up relative to it
DIRNAME = Path(__file__).absolute().resolve().parent
|
||
|
||
def import_module_from_path(module_name, module_path):
    """
    Load and execute a Python source file as a module.

    Args:
        module_name: Name to give the loaded module.
        module_path: Location of the source file to execute.

    Returns:
        The executed module object.

    Raises:
        ImportError: If no import spec or loader can be built for
            `module_path` (for example, a file without a recognised Python
            suffix).
    """
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    # spec_from_file_location returns None when no loader matches the path;
    # fail here with a clear message instead of an AttributeError further down.
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot load module {module_name!r} from {module_path}")

    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module
|
||
|
||
# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

# Directory iteration order is filesystem-dependent and it decides the column
# layout of the concatenated features, so sort the files for reproducibility.
for f in sorted(DIRNAME.glob("feature/feat*.py")):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature block only if it has the same width on all three splits
    if X_train_f.shape[-1] == X_valid_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)
        print(f"Feature [{f.stem}] has been added to the feature list")

if not X_train_l:
    raise ValueError("No usable feature was produced by feature/feat*.py")

# Each feature block gets its own top-level column key ("feature_0", "feature_1", ...)
X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

# 3) Train the models
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in sorted(DIRNAME.glob("model/model*.py")):
    # model_xxx.py is paired with select_xxx.py in the same directory
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
    print(f"Model [{f.stem}] has been trained")

if not model_l:
    raise ValueError("No model was trained from model/model*.py")

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    auroc = roc_auc_score(y_valid, y_valid_pred)
    print(f"[{type(model).__name__}] AUROC on valid set: {auroc}")
    metrics_all.append(auroc)

# 5) Save the validation AUROC of the best model
max_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[max_index]], index=["AUROC"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model
best_model, best_predict, best_select = model_l[max_index]
X_test_selected = best_select.select(X_test.copy())
# predict() may return an (n, 1) ndarray or a pd.Series depending on the model;
# np.asarray(...).ravel() handles both (pd.Series has no .flatten()). No offset
# is added: the target is never shifted during preprocessing.
y_test_pred = np.asarray(best_predict(best_model, X_test_selected)).ravel()

# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["target"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)