Skip to content

Commit

Permalink
add kaggle tpl (#482)
Browse files Browse the repository at this point in the history
  • Loading branch information
XianBW authored Nov 11, 2024
1 parent f3405ca commit 3ddba41
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
Expand All @@ -22,8 +22,8 @@ def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_vali

# Validate the model
y_valid_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy: {accuracy:.4f}")
auroc = roc_auc_score(y_valid, y_valid_pred)
print(f"Validation AUROC: {auroc:.4f}")

return model

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
"device": "cuda",
"tree_method": "hist",
"objective": "binary:logistic",
"eval_metric": "auc",
}
num_boost_round = 10

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import os

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


def preprocess_script():
"""
This method applies the preprocessing steps to the training, validation, and test datasets.
"""
if os.path.exists("/kaggle/input/X_train.pkl"):
X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
others = pd.read_pickle("/kaggle/input/others.pkl")

return X_train, X_valid, y_train, y_valid, X_test, *others

train_df = pd.read_csv("/kaggle/input/train.csv")
test_df = pd.read_csv("/kaggle/input/test.csv")

X = train_df.drop(["pressure", "id"], axis=1)
y = train_df["pressure"]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=0)

# Load and preprocess the test data
ids = test_df["id"]
X_test = test_df.drop(["id"], axis=1)

return X_train, X_valid, y_train, y_valid, X_test, ids
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import pandas as pd

"""
Here is the feature engineering code for each task, with a class that has a fit and transform method.
Remember
"""


class IdentityFeature:
def fit(self, train_df: pd.DataFrame):
"""
Fit the feature engineering model to the training data.
"""
pass

def transform(self, X: pd.DataFrame):
"""
Transform the input data.
"""
return X


feature_engineering_cls = IdentityFeature
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
"""
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=32, n_jobs=-1)

# Fit the model
model.fit(X_train, y_train)

# Predict on the validation set
y_valid_pred = model.predict(X_valid)

# Calculate the mean absolute error on the validation set
mae = mean_absolute_error(y_valid, y_valid_pred)
print(f"Validation MAE of RandomForestRegressor: {mae}")

return model


def predict(model, X):
"""
Keep feature selection's consistency and make predictions.
"""
# Predict using the trained model
y_pred = model.predict(X)

return y_pred.reshape(-1, 1)
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame) -> xgb.Booster:
"""Define and train the model. Merge feature_select"""
# 将数据转换为 DMatrix 并指定设备
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

params = {
"learning_rate": 0.1,
"subsample": 0.95,
"colsample_bytree": 0.11,
"max_depth": 2,
"booster": "gbtree",
"reg_lambda": 66.1,
"reg_alpha": 15.9,
"random_state": 42,
"tree_method": "hist",
"device": "cuda",
"eval_metric": "mae",
}
num_boost_round = 1000

model = xgb.train(params, dtrain, num_boost_round=num_boost_round, evals=[(dvalid, "validation")], verbose_eval=100)
return model


def predict(model: xgb.Booster, X):
"""
Keep feature select's consistency.
"""
dtest = xgb.DMatrix(X)
y_pred = model.predict(dtest)
return y_pred
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
if X.columns.nlevels == 1:
return X
X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
if X.columns.nlevels == 1:
return X
X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
"""
Select relevant features. To be used in fit & predict function.
"""
# For now, we assume all features are relevant. This can be expanded to feature selection logic.
if X.columns.nlevels == 1:
return X
X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
return X
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import preprocess_script
from sklearn.metrics import mean_absolute_error

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def import_module_from_path(module_name, module_path):
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
cls = import_module_from_path(f.stem, f).feature_engineering_cls()
cls.fit(X_train)
X_train_f = cls.transform(X_train)
X_valid_f = cls.transform(X_valid)
X_test_f = cls.transform(X_test)

if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
X_train_l.append(X_train_f)
X_valid_l.append(X_valid_f)
X_test_l.append(X_test_f)
print(f"Feature [{f.stem}] has been added to the feature list")

X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])


model_l = [] # list[tuple[model, predict_func]]
for f in DIRNAME.glob("model/model*.py"):
select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
select_m = import_module_from_path(select_python_path.stem, select_python_path)
X_train_selected = select_m.select(X_train.copy())
X_valid_selected = select_m.select(X_valid.copy())

m = import_module_from_path(f.stem, f)
model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
print(f"Model [{f.stem}] has been trained")

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
X_valid_selected = select_m.select(X_valid.copy())
y_valid_pred = predict_func(model, X_valid_selected)
mae = mean_absolute_error(y_valid, y_valid_pred)
print(f"[{type(model).__name__}] MAE on valid set: {mae}")
metrics_all.append(mae)

# 5) Save the validation accuracy
max_index = np.argmin(metrics_all)
pd.Series(data=[metrics_all[max_index]], index=["MAE"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set and save them
X_test_selected = model_l[max_index][2].select(X_test.copy())
y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1


# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["pressure"])
submission_result.insert(0, "id", ids)

submission_result.to_csv("submission.csv", index=False)

0 comments on commit 3ddba41

Please sign in to comment.