Skip to content

Commit

Permalink
fix: unzip kaggle data (#464)
Browse files Browse the repository at this point in the history
* unzip kaggle data

* read local_data_path from .env file

* fix build docs error

* recover azure-identity packages

* optimize code logic

* add error when downloading data from kaggle

---------

Co-authored-by: Xu Yang <[email protected]>
  • Loading branch information
SunsetWolf and peteryang1 authored Nov 4, 2024
1 parent 83b3f78 commit 3a9fc8e
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 6 deletions.
1 change: 1 addition & 0 deletions constraints/3.10.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
azure-identity==1.17.1
dill==0.3.9
pillow==10.4.0
psutil==6.1.0
Expand Down
1 change: 1 addition & 0 deletions constraints/3.11.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
azure-identity==1.17.1
dill==0.3.9
pillow==10.4.0
psutil==6.1.0
Expand Down
2 changes: 1 addition & 1 deletion rdagent/app/kaggle/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class Config:
competition: str = ""
"""Kaggle competition name, e.g., 'sf-crime'"""

local_data_path: str = "/data/userdata/share/kaggle"
local_data_path: str = ""
"""Folder storing Kaggle competition data"""

if_action_choosing_based_on_UCB: bool = False
Expand Down
6 changes: 6 additions & 0 deletions rdagent/core/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,3 +42,9 @@ class ModelEmptyError(Exception):
"""
Exceptions raised when no model is generated correctly
"""


class KaggleError(Exception):
    """
    Exceptions raised when calling Kaggle API

    Used by the Kaggle crawler to signal that a `kaggle` command-line
    invocation (e.g. a competition data download) failed.
    """
28 changes: 23 additions & 5 deletions rdagent/scenarios/kaggle/kaggle_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from selenium.webdriver.common.by import By

from rdagent.app.kaggle.conf import KAGGLE_IMPLEMENT_SETTING
from rdagent.core.exception import KaggleError
from rdagent.core.prompts import Prompts
from rdagent.log import rdagent_logger as logger
from rdagent.oai.llm_utils import APIBackend
Expand Down Expand Up @@ -99,11 +100,28 @@ def kaggle_description_css_selectors() -> tuple[str, str]:
def download_data(competition: str, local_path: str = KAGGLE_IMPLEMENT_SETTING.local_data_path) -> None:
    """
    Download a Kaggle competition's data with the `kaggle` CLI and unpack it.

    Nothing is done when ``<local_path>/<competition>`` already exists.

    Parameters
    ----------
    competition : str
        Kaggle competition name, e.g., 'sf-crime'.
    local_path : str
        Folder under which the ``<competition>`` data folder is created.

    Raises
    ------
    KaggleError
        If the ``kaggle competitions download`` command exits with a non-zero status.
    """
    data_path = f"{local_path}/{competition}"
    if Path(data_path).exists():
        # The competition folder is already present; skip download and extraction.
        return

    try:
        subprocess.run(
            ["kaggle", "competitions", "download", "-c", competition, "-p", data_path],
            check=True,
            stderr=subprocess.PIPE,
            stdout=subprocess.PIPE,
        )
    except subprocess.CalledProcessError as e:
        message = f"Download failed: {e}, stderr: {e.stderr}, stdout: {e.stdout}"
        logger.error(message)
        # Chain the original error so the failing command stays visible in the traceback.
        raise KaggleError(message) from e

    # unzip data
    # Extraction must not be guarded by "target folder does not exist": the archive was
    # just downloaded into that very folder, so such a guard would always skip this step.
    main_zip = Path(data_path) / f"{competition}.zip"
    unzip_data(unzip_file_path=str(main_zip), unzip_target_path=data_path)

    # Take a snapshot of the nested archives before extracting them, so the directory
    # tree is not modified while it is being walked; the main archive is already done.
    nested_zips = [zip_file for zip_file in Path(data_path).rglob("*.zip") if zip_file != main_zip]
    for sub_zip_file in nested_zips:
        unzip_data(str(sub_zip_file), unzip_target_path=data_path)


def unzip_data(unzip_file_path: str, unzip_target_path: str) -> None:
with zipfile.ZipFile(unzip_file_path, "r") as zip_ref:
zip_ref.extractall(unzip_target_path)


def leaderboard_scores(competition: str) -> list[float]:
Expand Down

0 comments on commit 3a9fc8e

Please sign in to comment.