diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 000000000..0fbb13120 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,13 @@ +{ + "configurations": [ + { + "name": "Linux", + "includePath": [ + "${workspaceFolder}/**", + "/usr/local/cuda/include", + "/opt/rocm/include" + ] + } + ], + "version": 4 +} diff --git a/CITATION.cff b/CITATION.cff index c3409a923..419497d27 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,6 @@ cff-version: 1.2.0 title: "ARK: A GPU-driven system framework for scalable AI applications" -version: 0.4.1 +version: 0.5.0 message: >- If you use this project in your research, please cite it as below. authors: diff --git a/CMakeLists.txt b/CMakeLists.txt index 1149d449d..7e7795fd9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,8 +2,8 @@ # Licensed under the MIT license. set(ARK_MAJOR "0") -set(ARK_MINOR "4") -set(ARK_PATCH "1") +set(ARK_MINOR "5") +set(ARK_PATCH "0") set(ARK_VERSION "${ARK_MAJOR}.${ARK_MINOR}.${ARK_PATCH}") set(ARK_SOVERSION "${ARK_MAJOR}.${ARK_MINOR}") diff --git a/README.md b/README.md index cff16fe24..769f972e4 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,7 @@ A GPU-driven system framework for scalable AI applications. | Unit Tests (ROCm) | [![Unit Tests (ROCm)](https://github.com/microsoft/ark/actions/workflows/ut-rocm.yml/badge.svg?branch=main)](https://github.com/microsoft/ark/actions/workflows/ut-rocm.yml) | *NOTE (Nov 2023): ROCm unit tests will be replaced into an Azure pipeline in the future.* + *NOTE (Dec 2023): ROCm unit tests are currently failing due to an issue with the nodes. This will be fixed soon.* See [Quick Start](docs/quickstart.md) to quickly get started. @@ -29,18 +30,20 @@ ARK provides a set of APIs for users to express their distributed deep learning ARK is under active development and a part of its features will be added in a future release. The following describes key features of each version. 
-### New in ARK v0.4 (Latest Release) +### New in ARK v0.5 (Latest Release) -* Support AMD GPUs (CDNA2, single-GPU only) -* Add high-performance AllReduce & AllGather algorithms with MSCCL++ -* Fix major bugs in the scheduler +* Integrate with [MSCCL++](https://github.com/microsoft/mscclpp) +* Remove dependency on `gpudma` +* Add AMD CDNA3 architecture support +* Support communication for AMD GPUs +* Optimize OpGraph scheduling +* Add a multi-GPU Llama2 example -See details from https://github.com/microsoft/ark/issues/137. +See details from https://github.com/microsoft/ark/issues/168. -### ARK v0.5 (TBU, Dec. 2023) +### ARK v0.6 (TBU, Jan. 2024) -* Multi-GPU support for AMD GPUs -* Add multi-GPU LLM examples +* Overall performance optimization * Improve Python unit tests & code coverage ## Contributing diff --git a/ark/gpu/gpu_mem.cc b/ark/gpu/gpu_mem.cc index de214f52d..ad4cf100c 100644 --- a/ark/gpu/gpu_mem.cc +++ b/ark/gpu/gpu_mem.cc @@ -25,7 +25,7 @@ GpuMem::GpuMem(size_t bytes) { this->init(bytes); } GpuMem::GpuMem(const GpuMem::Info &info) { this->init(info); } // -void GpuMem::init(size_t bytes, bool expose) { +void GpuMem::init(size_t bytes, [[maybe_unused]] bool expose) { if (bytes == 0) { ERR(InvalidUsageError, "Tried to allocate zero byte."); } diff --git a/ark/include/ark.h b/ark/include/ark.h index 8546772cb..1df3a3322 100644 --- a/ark/include/ark.h +++ b/ark/include/ark.h @@ -10,8 +10,8 @@ #include #define ARK_MAJOR 0 -#define ARK_MINOR 4 -#define ARK_PATCH 1 +#define ARK_MINOR 5 +#define ARK_PATCH 0 #define ARK_VERSION (ARK_MAJOR * 10000 + ARK_MINOR * 100 + ARK_PATCH) namespace ark { diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake index fc9e3d2cc..701bf3462 100644 --- a/cmake/Utils.cmake +++ b/cmake/Utils.cmake @@ -24,10 +24,10 @@ endif() find_program(BLACK black) if(BLACK) add_custom_target(pylint - COMMAND python3.8 -m black --check --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} + COMMAND python3 -m black --check --config 
${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} ) add_custom_target(pylint-autofix - COMMAND python3.8 -m black --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} + COMMAND python3 -m black --config ${PROJECT_SOURCE_DIR}/pyproject.toml ${PROJECT_SOURCE_DIR} ) else() message(STATUS "black not found.") diff --git a/docs/install.md b/docs/install.md index 579bc4b07..4fc252528 100644 --- a/docs/install.md +++ b/docs/install.md @@ -2,22 +2,12 @@ ## Prerequisites -* Linux kernel >= 4.15.0 - - - If you have a lower version, you can upgrade it via: - ```bash - sudo apt-get update - sudo apt-get install -y linux-image-4.15.0-13-generic linux-header-4.15.0-13-generic - ``` - * CMake >= 3.25.0 and Python >= 3.8 * Supported GPUs - NVIDIA GPUs: Volta (CUDA >= 11.1) / Ampere (CUDA >= 11.1) / Hopper (CUDA >= 12.0) - Hopper support will be added in the future. - - AMD GPUs: CDNA2 (ROCm >= 5.0) / CDNA3 - - Multi-GPU execution is not yet supported for AMD GPUs and will be supported by a future release. - - CDNA3 support will be added in the future. + - AMD GPUs: CDNA2 (ROCm >= 5.7) / CDNA3 (ROCm >= 5.7) * Mellanox OFED @@ -28,9 +18,9 @@ We currently provide only *base images* for ARK, which contain all the dependenc You can pull a base image as follows. ``` # For NVIDIA GPUs -docker pull ghcr.io/microsoft/ark/ark:base-dev-cuda12.1 +docker pull ghcr.io/microsoft/ark/ark:base-dev-cuda12.2 # For AMD GPUs -docker pull ghcr.io/microsoft/ark/ark:base-dev-rocm5.6 +docker pull ghcr.io/microsoft/ark/ark:base-dev-rocm5.7 ``` Check [ARK containers](https://github.com/microsoft/ark/pkgs/container/ark%2Fark) for all available Docker images. 
diff --git a/docs/sphinx/source/conf.py b/docs/sphinx/source/conf.py index 9ed76ab02..ab7992d4b 100644 --- a/docs/sphinx/source/conf.py +++ b/docs/sphinx/source/conf.py @@ -20,8 +20,8 @@ project = "ARK" copyright = "2023, ARK Team" author = "ARK Team" -version = "0.4.1" -release = "0.4.1" +version = "0.5.0" +release = "0.5.0" # -- General configuration --------------------------------------------------- diff --git a/pyproject.toml b/pyproject.toml index 1d3fe2598..516cc7c39 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "ark" -version = "0.4.1" +version = "0.5.0" [tool.scikit-build] cmake.minimum-version = "3.25" diff --git a/third_party/mscclpp b/third_party/mscclpp index 1cd862904..4681de9cd 160000 --- a/third_party/mscclpp +++ b/third_party/mscclpp @@ -1 +1 @@ -Subproject commit 1cd8629047fd0b2b4eb7c6c648924ff6f08bd9c0 +Subproject commit 4681de9cd914f7db2b7eb09b75f1e6735b95b75b