feat: kaggle refactor (#489)
* init trial

* Add spec info

* auto unzip mlebench prepared data for our scenario

* successfully run example

* successfully run main

* simplify load training

* extract load_from_raw_data

* split the files (still buggy)
It should stop at ~20 epochs and reach the end

* some changes

* Fix bug to run example

* (success) runs up to the feature step

* refine model and ensemble

* add metrics in ens.py

* update README & spec.md

* ens change

* fix ens bug

* Delete rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/train.py

* add template_path in KG_conf

* fix test kaggle

* CI

* make test_import not check kaggle template codes

---------

Co-authored-by: Bowen Xian <[email protected]>
you-n-g and XianBW authored Nov 20, 2024
1 parent a9caab7 commit 1b057d0
Showing 184 changed files with 588 additions and 11 deletions.
3 changes: 3 additions & 0 deletions rdagent/app/kaggle/conf.py
@@ -44,6 +44,9 @@ class Config:
     competition: str = ""
     """Kaggle competition name, e.g., 'sf-crime'"""
 
+    template_path: str = "rdagent/scenarios/kaggle/experiment/templates"
+    """Kaggle competition base templates path"""
+
     local_data_path: str = ""
     """Folder storing Kaggle competition data"""
8 changes: 6 additions & 2 deletions rdagent/scenarios/kaggle/experiment/kaggle_experiment.py
@@ -38,7 +38,9 @@ class KGModelExperiment(ModelExperiment[ModelTask, KGFBWorkspace, ModelFBWorkspace]):
     def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.experiment_workspace = KGFBWorkspace(
-            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+            template_folder_path=Path(__file__).resolve()
+            / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve()
+            / KAGGLE_IMPLEMENT_SETTING.competition
         )
         if len(self.based_experiments) > 0:
             self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict)
@@ -62,7 +64,9 @@ class KGFactorExperiment(FeatureExperiment[FactorTask, KGFBWorkspace, FactorFBWorkspace]):
     def __init__(self, *args, source_feature_size: int = None, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.experiment_workspace = KGFBWorkspace(
-            template_folder_path=Path(__file__).parent / f"{KAGGLE_IMPLEMENT_SETTING.competition}_template"
+            template_folder_path=Path(__file__).resolve()
+            / Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve()
+            / KAGGLE_IMPLEMENT_SETTING.competition
         )
         if len(self.based_experiments) > 0:
             self.experiment_workspace.inject_code(**self.based_experiments[-1].experiment_workspace.code_dict)
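A note on the new path arithmetic: when the right-hand operand of `/` is an absolute path, `pathlib` discards everything to its left, so `Path(__file__).resolve() / Path(template_path).resolve()` reduces to the resolved `template_path`, i.e. the relative default is resolved against the current working directory. A minimal sketch of this behavior (hypothetical paths):

```python
from pathlib import Path

# pathlib ignores earlier segments once an absolute segment appears:
assert Path("/repo/rdagent/file.py") / Path("/cwd/templates") == Path("/cwd/templates")

# So the new expression is equivalent to:
#   Path(KAGGLE_IMPLEMENT_SETTING.template_path).resolve() / KAGGLE_IMPLEMENT_SETTING.competition
```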
4 changes: 1 addition & 3 deletions rdagent/scenarios/kaggle/experiment/workspace.py
@@ -26,11 +26,10 @@
 
 
 class KGFBWorkspace(FBWorkspace):
-    def __init__(self, template_folder_path: Path, *args, entry="python train.py", **kwargs) -> None:
+    def __init__(self, template_folder_path: Path, *args, **kwargs) -> None:
         super().__init__(*args, **kwargs)
         self.inject_code_from_folder(template_folder_path)
         self.data_description: List[Tuple[str, int]] = []
-        self.entry = entry  # this is for debugging (you may want to change it into `sleep 1000`)
 
     @property
     def model_description(self) -> dict[str, str]:
@@ -86,7 +85,6 @@ def execute(self, run_env: dict = {}, *args, **kwargs) -> str:
 
         execute_log = kgde.run(
             local_path=str(self.workspace_path),
-            entry=self.entry,
             env=run_env,
             running_extra_volume=running_extra_volume,
         )
2 changes: 1 addition & 1 deletion rdagent/scenarios/kaggle/kaggle_crawler.py
@@ -122,7 +122,7 @@ def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path
         f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/public/* ./{competition}'", local_path=local_path
     )
     mleb_env.run(
-        f"/bin/sh -c 'cp -r ./zip_files/{competition}/prepared/private/test.csv ./{competition}/valid.csv'",
+        f'/bin/sh -c \'for zip_file in ./{competition}/*.zip; do dir_name="${{zip_file%.zip}}"; mkdir -p "$dir_name"; unzip -o "$zip_file" -d "$dir_name"; done\'',
         local_path=local_path,
     )
     # NOTE:
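The new shell one-liner extracts every prepared `*.zip` into a directory named after the archive. A minimal Python equivalent of that loop, for illustration only (the repository actually runs the shell command inside the mle-bench environment):

```python
import zipfile
from pathlib import Path


def unzip_prepared_data(competition_dir: Path) -> None:
    """Extract every `<name>.zip` in the competition folder into a `<name>/` directory."""
    for zip_path in competition_dir.glob("*.zip"):
        target = zip_path.with_suffix("")  # "${zip_file%.zip}": strip the .zip suffix
        target.mkdir(parents=True, exist_ok=True)  # mkdir -p "$dir_name"
        with zipfile.ZipFile(zip_path) as zf:
            zf.extractall(target)  # unzip -o "$zip_file" -d "$dir_name" (overwrites)
```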
43 changes: 43 additions & 0 deletions rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/README.md
@@ -0,0 +1,43 @@
# Motivation of the example
We use a runnable, concrete example to demonstrate what a project should look like after being generated by a large language model.


# Content example and the workflow

> NOTE: the `README.md` itself is not generated by the LLM; the remaining content is generated by the LLM.

## Extra input information beyond the competition information

[[../meta/spec.md]]
- [ ] TODO

## Step0: Specification generation

- Generate the specification
  [[spec.md]]
  - [ ] TODO: perfect
- Generate the data-loading code
  [[load_data.py]]

- Why do we merge these two steps together?
  - Successfully running `load_data.py` is a form of verification of `spec.md`.


## Step1: Write the feature engineering code
- We can generate files like [[feat01.py]] that match the pattern `feat.*\.py`.

## Step2: Model training
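- [[main.py]] imports a `model_workflow` function from a generated `model01.py`, which is not part of this excerpt. Below is a hypothetical minimal sketch matching the interface that [[main.py]] expects (the logistic-regression body is an illustration, not the generated model):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression


def model_workflow(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    X_test: np.ndarray,
) -> tuple[np.ndarray, np.ndarray]:
    """Train one model and return probability predictions for validation and test data."""

    def flat(X: np.ndarray) -> np.ndarray:
        # Flatten (N, H, W, C) uint8 images to (N, H*W*C) floats in [0, 1]
        return X.reshape(len(X), -1).astype(np.float32) / 255.0

    # y_val is available for early stopping / model selection; unused in this sketch
    model = LogisticRegression(max_iter=1000)
    model.fit(flat(X_train), y_train)
    val_pred = model.predict_proba(flat(X_val))[:, 1]
    test_pred = model.predict_proba(flat(X_test))[:, 1]
    return val_pred, test_pred
```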


## Step3: Ensemble and decision
- Generate `ens_and_decision` (see [[ens.py]]).
- Why do we generate scores in the ensemble phase?
  - The ensemble step has tasks that overlap heavily with scoring:
    - ensembling usually checks each model's performance before combining them;
    - so an additional step to record that performance is easy to add here.

## Step4: Build workflow

[[main.py]]
55 changes: 55 additions & 0 deletions rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/ens.py
@@ -0,0 +1,55 @@
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score


def ens_and_decision(test_pred_l: list[np.ndarray], val_pred_l: list[np.ndarray], val_label: np.ndarray) -> np.ndarray:
    """
    Handle the following:
    1) Ensemble predictions using a weighted average, weighted by each model's validation AUROC.
    2) Make the final decision after ensembling (convert the predictions to binary form).

    Parameters
    ----------
    test_pred_l : list[np.ndarray]
        List of predictions on the test data.
    val_pred_l : list[np.ndarray]
        List of predictions on the validation data.
    val_label : np.ndarray
        True labels of the validation data.

    Returns
    -------
    np.ndarray
        Binary predictions on the test data.
    """
    # Score each model's validation predictions with AUROC
    scores = []
    for val_pred in val_pred_l:
        scores.append(roc_auc_score(val_label, val_pred))

    # Normalize the scores to get weights
    total_score = sum(scores)
    weights = [score / total_score for score in scores]

    # Weighted average of test predictions
    weighted_test_pred = np.zeros_like(test_pred_l[0])
    for weight, test_pred in zip(weights, test_pred_l):
        weighted_test_pred += weight * test_pred

    weighted_valid_pred = np.zeros_like(val_pred_l[0])
    for weight, val_pred in zip(weights, val_pred_l):
        weighted_valid_pred += weight * val_pred

    weighted_valid_pred_score = roc_auc_score(val_label, weighted_valid_pred)

    # Record per-model and ensemble scores so later steps can inspect performance
    scores_df = pd.DataFrame(
        {
            "Model": list(range(len(val_pred_l))) + ["weighted_average_ensemble"],
            "AUROC": scores + [weighted_valid_pred_score],
        }
    )
    scores_df.to_csv("scores.csv", index=False)

    pred_binary_l = [0 if value < 0.50 else 1 for value in weighted_test_pred]
    return np.array(pred_binary_l)
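A toy usage sketch with made-up numbers (both models rank the validation labels perfectly here, so their weights come out equal; `scores.csv` is written as a side effect):

```python
import numpy as np

val_label = np.array([1, 0, 1, 0])
val_pred_l = [np.array([0.9, 0.2, 0.8, 0.3]), np.array([0.7, 0.4, 0.6, 0.2])]
test_pred_l = [np.array([0.6, 0.1]), np.array([0.8, 0.3])]

# Both validation AUROCs are 1.0 -> weights 0.5 and 0.5.
# Weighted test predictions are [0.7, 0.2] -> thresholded at 0.5 -> [1, 0].
print(ens_and_decision(test_pred_l, val_pred_l, val_label))  # [1 0]
```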
55 changes: 55 additions & 0 deletions rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/feat01.py
@@ -0,0 +1,55 @@
import numpy as np


def feat_eng(
    X: np.ndarray,
    y: np.ndarray | None = None,
    X_fit: np.ndarray | None = None,
    y_fit: np.ndarray | None = None,
    param: object | None = None,
) -> tuple[np.ndarray, np.ndarray | None, object]:
    """
    Perform feature engineering on the input data.

    Parameters:
    - X: np.ndarray
        The input data to be transformed. A concrete example could be:
        array([[[[207, 194, 203],
                 ...,
                 [191, 183, 164],
                 [176, 168, 149],
                 [181, 173, 152]]]], dtype=uint8)
    - y: np.ndarray | None
        The target data. A concrete example could be:
        array([1, 0, 1, 0, 1, 1, ..., ])
    - X_fit: np.ndarray | None
        Data for fitting the transformation parameters.
    - y_fit: np.ndarray | None
        Target data for fitting.
    - param: object | None
        Pre-fitted parameters for transformation.

    Returns:
    - transformed_data: np.ndarray
        Transformed data.
    - transformed_target: np.ndarray | None
        Transformed target data.
    - fitted_param: object
        Fitted parameters.

    Notes:
    - Some preprocessing (e.g., data selection) is based on y.

    Typical usage:

    .. code-block:: python

        X_transformed, y_transformed, fitted_param = feat_eng(X, y, X, y)
        X_test_transformed, _, _ = feat_eng(X_test, param=fitted_param)
    """
    # This is an example of an identity feature transformation.
    # We don't change the content of the data, but we demonstrate the typical workflow of feature engineering.
    if param is None:
        # Fit transformation parameters from X_fit and y_fit (the identity transform needs none)
        pass
    # Use the fitted parameters to transform the data X, y
    return X, y, param
82 changes: 82 additions & 0 deletions rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/load_data.py
@@ -0,0 +1,82 @@
"""
Load competition data to uniform format
"""

import os

import numpy as np
import pandas as pd
from PIL import Image


def load_test_images(folder):
images = []
filenames = []
for filename in os.listdir(folder):
img = Image.open(os.path.join(folder, filename))
if img is not None:
images.append(np.array(img))
filenames.append(filename)
return np.array(images), filenames


def load_images_and_labels(csv_file, image_folder):
images = []
labels = []
df = pd.read_csv(csv_file)
for idx, row in df.iterrows():
img = Image.open(os.path.join(image_folder, row["id"]))
if img is not None:
images.append(np.array(img))
labels.append(row["has_cactus"])
return np.array(images), np.array(labels)


def load_from_raw_data() -> tuple[np.ndarray, np.ndarray, np.ndarray, list[str]]:
"""
load raw data from disk to get data in uniform data
Return:
X: np.array
a concrete example could be:
.. code-block:: text
array([[[[207, 194, 203],
...,
[191, 183, 164],
[176, 168, 149],
[181, 173, 152]]]], dtype=uint8)
y: np.array
a concrete example could be:
.. code-block:: python
array([1, 0, 1, 0, 1, 1, ..., ])
X_test: np.array
a concrete example is similar to `X`.
test_ids: the id representing the image. it is used to generate the submission file
a concrete example could be:
.. code-block:: python
['1398ad045aa57aee5f38e7661e9d49e8.jpg',
'0051207eb794887c619341090de84b50.jpg',
'a8202dd82c42e252bef921ada7607b6c.jpg',
'76c329ff9e3c5036b616f4e88ebba814.jpg',
...]
"""
X, y = load_images_and_labels("/kaggle/input/train.csv", "/kaggle/input/train/")

test_folder = "/kaggle/input/test/"
X_test, test_filenames = load_test_images(test_folder)
# Store filenames separately
test_ids = [os.path.basename(filename).replace(".tif", "") for filename in test_filenames]
return X, y, X_test, test_ids
37 changes: 37 additions & 0 deletions rdagent/scenarios/kaggle/tpl_ex/aerial-cactus-identification/main.py
@@ -0,0 +1,37 @@
from load_data import load_from_raw_data
from sklearn.model_selection import train_test_split

# Load data
train_images, train_labels, test_images, test_ids = load_from_raw_data()


# Feature engineering
from feat01 import feat_eng

train_images, train_labels, train_param = feat_eng(train_images, train_labels)
test_images, _, _ = feat_eng(test_images, param=train_param)


# (Cross) validation split
train_images, validation_images, train_labels, validation_labels = train_test_split(
    train_images, train_labels, test_size=0.1, random_state=42
)


# Model workflow
from model01 import model_workflow

val_pred, test_pred = model_workflow(train_images, train_labels, validation_images, validation_labels, test_images)


# Ensemble
from ens import ens_and_decision

pred_binary = ens_and_decision([test_pred], [val_pred], validation_labels)


# Save the submission file
with open("submission.csv", "w") as csv_file:
    csv_file.write("id,has_cactus\n")
    for tid, prediction in zip(test_ids, pred_binary):
        csv_file.write(f"{tid},{prediction}\n")
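The hand-written CSV loop above could equivalently be done with pandas (a sketch assuming the same two columns):

```python
import pandas as pd

pd.DataFrame({"id": test_ids, "has_cactus": pred_binary}).to_csv("submission.csv", index=False)
```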