feat: add a new competition (#474)
* add tabular-playground-series-dec-2021

* finished

* fix a mistake

* fix a bug

* fix a bug
WinstonLiyt authored Nov 5, 2024
1 parent d41343a commit 2fc0d77
Showing 11 changed files with 324 additions and 22 deletions.
3 changes: 2 additions & 1 deletion rdagent/scenarios/kaggle/docker/Dockerfile
@@ -25,4 +25,5 @@ RUN pip install xgboost
 RUN pip install sparse
 RUN pip install lightgbm
 RUN pip install pyarrow
-RUN pip install fastparquet
+RUN pip install fastparquet
+RUN pip install optuna
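Note: the image gains optuna alongside the existing learners. A minimal sketch of the kind of hyperparameter tuning this enables (illustration only, not part of this commit; the dataset, search space, and trial count are placeholders):

import optuna
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Placeholder data standing in for the competition's train split.
X, y = make_classification(n_samples=500, n_features=20, random_state=42)


def objective(trial: optuna.Trial) -> float:
    # Search a small hyperparameter space around the Random Forest baseline below.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 400),
        "max_depth": trial.suggest_int("max_depth", 4, 32),
        "n_jobs": -1,
        "random_state": 42,
    }
    model = RandomForestClassifier(**params)
    return cross_val_score(model, X, y, cv=3, scoring="accuracy").mean()


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)
print(study.best_params)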
@@ -0,0 +1,72 @@
import os

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def prepreprocess():
    """
    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
    """
    # Load and preprocess the data
    data_df = pd.read_csv("/kaggle/input/train.csv")
    data_df = data_df.drop(["Id"], axis=1)

    X = data_df.drop(["Cover_Type"], axis=1)
    y = data_df["Cover_Type"] - 1  # shift labels from 1..7 to 0..6 for the classifiers

    # Split the data into training and validation sets
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

    return X_train, X_valid, y_train, y_valid


def preprocess_script():
    """
    This method applies the preprocessing steps to the training, validation, and test datasets.
    """
    # Reuse cached splits when they have already been pickled
    if os.path.exists("/kaggle/input/X_train.pkl"):
        X_train = pd.read_pickle("/kaggle/input/X_train.pkl")
        X_valid = pd.read_pickle("/kaggle/input/X_valid.pkl")
        y_train = pd.read_pickle("/kaggle/input/y_train.pkl")
        y_valid = pd.read_pickle("/kaggle/input/y_valid.pkl")
        X_test = pd.read_pickle("/kaggle/input/X_test.pkl")
        others = pd.read_pickle("/kaggle/input/others.pkl")

        return X_train, X_valid, y_train, y_valid, X_test, *others

    X_train, X_valid, y_train, y_valid = prepreprocess()

    # Load and preprocess the test data
    submission_df = pd.read_csv("/kaggle/input/test.csv")
    ids = submission_df["Id"]
    X_test = submission_df.drop(["Id"], axis=1)

    return X_train, X_valid, y_train, y_valid, X_test, ids


def clean_and_impute_data(X_train, X_valid, X_test):
    """
    Handles inf and -inf values by replacing them with NaN,
    then imputes missing values using the mean strategy.
    Also removes duplicate columns.
    """
    # Replace inf and -inf with NaN
    X_train.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_valid.replace([np.inf, -np.inf], np.nan, inplace=True)
    X_test.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Impute missing values with the column mean (fit on train only)
    imputer = SimpleImputer(strategy="mean")
    X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)
    X_valid = pd.DataFrame(imputer.transform(X_valid), columns=X_valid.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X_test.columns)

    # Remove duplicate columns
    X_train = X_train.loc[:, ~X_train.columns.duplicated()]
    X_valid = X_valid.loc[:, ~X_valid.columns.duplicated()]
    X_test = X_test.loc[:, ~X_test.columns.duplicated()]

    return X_train, X_valid, X_test
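A quick sanity check of clean_and_impute_data's behavior (illustration only, not part of this commit):

# Illustration: inf values become NaN, then NaN is filled with the train-column mean.
import numpy as np
import pandas as pd
from fea_share_preprocess import clean_and_impute_data

X_tr = pd.DataFrame({"a": [1.0, np.inf, 3.0]})
X_va = pd.DataFrame({"a": [np.inf]})
X_te = pd.DataFrame({"a": [-np.inf]})
X_tr, X_va, X_te = clean_and_impute_data(X_tr, X_va, X_te)
print(X_tr["a"].tolist())  # [1.0, 2.0, 3.0] -- inf imputed with the mean of the remaining values
print(X_va["a"].tolist())  # [2.0] -- valid/test reuse the mean fitted on train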
@@ -0,0 +1,23 @@
import pandas as pd

"""
Feature engineering code for each task: a class exposing fit and transform methods.
"""


class IdentityFeature:
    def fit(self, train_df: pd.DataFrame):
        """
        Fit the feature engineering model to the training data.
        """
        pass

    def transform(self, X: pd.DataFrame):
        """
        Transform the input data.
        """
        return X


feature_engineering_cls = IdentityFeature
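IdentityFeature is a pass-through placeholder: any generated feature module only needs to expose the same fit/transform contract via feature_engineering_cls. A hypothetical replacement (illustrative, not part of this commit; the class name and column names are assumed from the Forest Cover Type schema this playground competition mirrors):

import pandas as pd


class DistanceInteractionFeature:
    """Hypothetical example: derive a combined hydrology-distance feature."""

    def fit(self, train_df: pd.DataFrame):
        # Stateless transform; nothing to learn from the training data.
        pass

    def transform(self, X: pd.DataFrame):
        X = X.copy()
        # Only derive the feature if both assumed source columns are present.
        if {"Horizontal_Distance_To_Hydrology", "Vertical_Distance_To_Hydrology"} <= set(X.columns):
            X["Distance_To_Hydrology"] = (
                X["Horizontal_Distance_To_Hydrology"] ** 2 + X["Vertical_Distance_To_Hydrology"] ** 2
            ) ** 0.5
        return X


feature_engineering_cls = DistanceInteractionFeature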
@@ -0,0 +1,38 @@
"""
Motivation of the model:
The Random Forest model is chosen for its robustness and ability to handle large datasets with higher dimensionality.
It reduces overfitting by averaging multiple decision trees and typically performs well out of the box, making it a good
baseline model for many classification tasks.
"""

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


def fit(X_train: pd.DataFrame, y_train: pd.Series, X_valid: pd.DataFrame, y_valid: pd.Series):
"""
Define and train the Random Forest model. Merge feature selection into the pipeline.
"""
# Initialize the Random Forest model
model = RandomForestClassifier(n_estimators=200, random_state=32, n_jobs=-1)

# Fit the model
model.fit(X_train, y_train)

# Validate the model
y_valid_pred = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"Validation Accuracy: {accuracy:.4f}")

return model


def predict(model, X):
"""
Keep feature selection's consistency and make predictions.
"""
# Predict using the trained model
y_pred = model.predict(X)

return y_pred.reshape(-1, 1)
@@ -0,0 +1,35 @@
"""
motivation of the model
"""

import pandas as pd
import xgboost as xgb


def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_valid: pd.DataFrame):
"""Define and train the model. Merge feature_select"""
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

params = {
"objective": "multi:softmax", # Use softmax for multi-class classification
"num_class": len(set(y_train)), # Number of classes
"nthread": -1,
"tree_method": "gpu_hist",
"device": "cuda",
}
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)

return bst


def predict(model, X):
"""
Keep feature select's consistency.
"""
dtest = xgb.DMatrix(X)
y_pred = model.predict(dtest)
return y_pred.astype(int).reshape(-1, 1)
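Caveat: the params above mix GPU APIs -- "gpu_hist" is the pre-2.0 spelling, while "device": "cuda" is the 2.x one, and XGBoost 2.x deprecates "gpu_hist" in favor of the hist method plus an explicit device. A sketch of the 2.x-style parameters, assuming the image pins XGBoost >= 2.0 (the num_class value assumes the seven Cover_Type classes; not part of this commit):

params_2x = {
    "objective": "multi:softmax",
    "num_class": 7,          # assumed: Cover_Type spans seven classes after the -1 shift
    "tree_method": "hist",   # XGBoost 2.x spelling; GPU selection moves to "device"
    "device": "cuda",
}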
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded with feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the MultiIndex columns produced by the feature-concat step in train.py
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
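For reference, train.py concatenates each feature module's output under keys feature_0, feature_1, ..., which is what produces the two-level columns this selector flattens. A small illustration (not part of this commit):

# Illustration: how select() flattens the two-level columns from train.py's pd.concat.
import pandas as pd

df = pd.concat(
    [pd.DataFrame({"Elevation": [2596]}), pd.DataFrame({"Slope": [3]})],
    axis=1,
    keys=["feature_0", "feature_1"],
)
print(df.columns.nlevels)  # 2
df.columns = ["_".join(str(i) for i in col).strip() for col in df.columns.values]
print(list(df.columns))    # ['feature_0_Elevation', 'feature_1_Slope']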
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded with feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the MultiIndex columns produced by the feature-concat step in train.py
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded with feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the MultiIndex columns produced by the feature-concat step in train.py
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,12 @@
import pandas as pd


def select(X: pd.DataFrame) -> pd.DataFrame:
    """
    Select relevant features. To be used in the fit & predict functions.
    """
    # For now, we assume all features are relevant. This can be expanded with feature selection logic.
    if X.columns.nlevels == 1:
        return X
    # Flatten the MultiIndex columns produced by the feature-concat step in train.py
    X.columns = ["_".join(str(i) for i in col).strip() for col in X.columns.values]
    return X
@@ -0,0 +1,91 @@
import importlib.util
import random
from pathlib import Path

import numpy as np
import pandas as pd
from fea_share_preprocess import clean_and_impute_data, preprocess_script
from sklearn.metrics import accuracy_score, matthews_corrcoef

# Set random seed for reproducibility
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_metrics_for_classification(y_true, y_pred):
    """Compute MCC for classification. (Currently unused; accuracy is reported below.)"""
    mcc = matthews_corrcoef(y_true, y_pred)
    return mcc


def import_module_from_path(module_name, module_path):
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
X_test_l = []

for f in DIRNAME.glob("feature/feat*.py"):
    cls = import_module_from_path(f.stem, f).feature_engineering_cls()
    cls.fit(X_train)
    X_train_f = cls.transform(X_train)
    X_valid_f = cls.transform(X_valid)
    X_test_f = cls.transform(X_test)

    # Keep a feature set only if it yields the same width on train, valid, and test
    if X_train_f.shape[-1] == X_valid_f.shape[-1] and X_train_f.shape[-1] == X_test_f.shape[-1]:
        X_train_l.append(X_train_f)
        X_valid_l.append(X_valid_f)
        X_test_l.append(X_test_f)

X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
X_test = pd.concat(X_test_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_test_l))])

print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)


# 3) Train each model with its paired feature-selection module
model_l = []  # list[tuple[model, predict_func, select_module]]
for f in DIRNAME.glob("model/model*.py"):
    select_python_path = f.with_name(f.stem.replace("model", "select") + f.suffix)
    select_m = import_module_from_path(select_python_path.stem, select_python_path)
    X_train_selected = select_m.select(X_train.copy())
    X_valid_selected = select_m.select(X_valid.copy())

    m = import_module_from_path(f.stem, f)
    model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))

# 4) Evaluate the models on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    print(f"final accuracy on valid set: {accuracy}")
    metrics_all.append(accuracy)

# 5) Save the best validation accuracy
max_index = np.argmax(metrics_all)
pd.Series(data=[metrics_all[max_index]], index=["multi-class accuracy"]).to_csv("submission_score.csv")

# 6) Make predictions on the test set with the best model (+1 undoes the label shift from preprocessing)
X_test_selected = model_l[max_index][2].select(X_test.copy())
y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1


# 7) Submit predictions for the test set
submission_result = pd.DataFrame(y_test_pred, columns=["Cover_Type"])
submission_result.insert(0, "Id", ids)

submission_result.to_csv("submission.csv", index=False)
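Aside: compute_metrics_for_classification is defined but never called, so accuracy alone drives model selection. If MCC were the preferred metric, step 4 could be swapped as follows (a sketch, not part of this commit):

# Hypothetical variant of step 4 using the unused MCC helper (illustration only).
metrics_all = []
for model, predict_func, select_m in model_l:
    X_valid_selected = select_m.select(X_valid.copy())
    y_valid_pred = predict_func(model, X_valid_selected)
    mcc = compute_metrics_for_classification(y_valid, y_valid_pred)
    print(f"final MCC on valid set: {mcc}")
    metrics_all.append(mcc)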
36 changes: 15 additions & 21 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -100,33 +100,27 @@ def kaggle_description_css_selectors() -> tuple[str, str]:
 def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path) -> None:
     if KAGGLE_IMPLEMENT_SETTING.if_using_mle_data:
         zipfile_path = f"{local_path}/zip_files"
-        if not Path(zipfile_path).exists():
+        zip_competition_path = Path(zipfile_path) / competition
+        if not zip_competition_path.exists():
             try:
                 subprocess.run(
-                    ["mlebench", "prepare", "-c", competition, "-p", zipfile_path],
+                    ["mlebench", "prepare", "-c", competition, "--data-dir", zipfile_path],
                     check=True,
                     stderr=subprocess.PIPE,
                     stdout=subprocess.PIPE,
                 )
             except subprocess.CalledProcessError as e:
                 logger.error(f"Download failed: {e}, stderr: {e.stderr}, stdout: {e.stdout}")
                 raise KaggleError(f"Download failed: {e}, stderr: {e.stderr}, stdout: {e.stdout}")
-        # unzip data
-        unzip_path = Path(local_path) / f"{competition}_test"
-        if not unzip_path.exists():
-            unzip_data(unzip_file_path=f"{zipfile_path}/{competition}.zip", unzip_target_path=unzip_path)
-            for sub_zip_file in unzip_path.rglob("*.zip"):
-                unzip_data(sub_zip_file, unzip_target_path=unzip_path)
-
-        competition_path = Path(local_path) / competition
-        competition_path.mkdir(parents=True, exist_ok=True)
-        processed_data_folder_path = unzip_path / "prepared/public"
-        subprocess.run(f"cp -r {processed_data_folder_path}/* {competition_path}", shell=True)
-        subprocess.run(f"rm -rf {unzip_path}", shell=True)
+        competition_path = Path(local_path) / competition
+        competition_path.mkdir(parents=True, exist_ok=True)
+        processed_data_folder_path = zip_competition_path / "prepared/public"
+        subprocess.run(f"cp -r {processed_data_folder_path}/* {competition_path}", shell=True)
 
     else:
         zipfile_path = f"{local_path}/zip_files"
-        if not Path(zipfile_path).exists():
+        if not Path(f"{zipfile_path}/{competition}.zip").exists():
             try:
                 subprocess.run(
                     ["kaggle", "competitions", "download", "-c", competition, "-p", zipfile_path],
@@ -138,12 +132,12 @@ def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path) -> None:
                 logger.error(f"Download failed: {e}, stderr: {e.stderr}, stdout: {e.stdout}")
                 raise KaggleError(f"Download failed: {e}, stderr: {e.stderr}, stdout: {e.stdout}")
 
-            # unzip data
-            unzip_path = f"{local_path}/{competition}"
-            if not Path(unzip_path).exists():
-                unzip_data(unzip_file_path=f"{zipfile_path}/{competition}.zip", unzip_target_path=unzip_path)
-            for sub_zip_file in Path(unzip_path).rglob("*.zip"):
-                unzip_data(sub_zip_file, unzip_target_path=unzip_path)
+        # unzip data
+        unzip_path = f"{local_path}/{competition}"
+        if not Path(unzip_path).exists():
+            unzip_data(unzip_file_path=f"{zipfile_path}/{competition}.zip", unzip_target_path=unzip_path)
+            for sub_zip_file in Path(unzip_path).rglob("*.zip"):
+                unzip_data(sub_zip_file, unzip_target_path=unzip_path)
 
 
 def unzip_data(unzip_file_path: str, unzip_target_path: str) -> None: